Diffstat (limited to 'libavcodec/mips')
103 files changed, 87999 insertions, 48 deletions
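The bulk of this commit is pre-R6 MIPS inline assembly (guarded by !HAVE_MIPS32R6 && !HAVE_MIPS64R6) that replaces the generic quantize-and-encode helpers from libavcodec/aaccoder.c. As a reading aid only, and not part of the patch, the scalar C sketch below shows the per-coefficient logic that the recurring slt/movn sequences implement for the unsigned codebooks (UQUAD/UPAIR/ESC): clamp each quantized magnitude to the codebook maximum, collect one sign bit per nonzero coefficient, and append those sign bits after the Huffman codeword. The helper names are hypothetical.

    /* Hypothetical scalar equivalent of the slt/movn blocks used for the
     * unsigned codebooks; names are illustrative, not from the patch. */
    static inline int clamp_to_cb_max(int q, int cb_max)
    {
        return q > cb_max ? cb_max : q;        /* slt + movn against cb_max */
    }

    /* Gather one sign bit per nonzero quantized value, most significant
     * first, and count how many sign bits were produced. */
    static inline void gather_signs(const float *in, const int *q, int n,
                                    unsigned *sign_bits, int *count)
    {
        int i;
        *sign_bits = 0;
        *count     = 0;
        for (i = 0; i < n; i++) {
            if (q[i]) {
                *sign_bits = (*sign_bits << 1) | (in[i] < 0.0f);
                (*count)++;
            }
        }
    }

The assembly performs the same work branch-free with conditional moves; the caller then writes p_bits[curidx] + count bits via put_bits(pb, v_bits, (p_codes[curidx] << count) | sign), exactly as in the C code of the diff below.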
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
new file mode 100644
index 0000000000..3c43600659
--- /dev/null
+++ b/libavcodec/mips/Makefile
@@ -0,0 +1,80 @@
+MIPSFPU-OBJS-$(CONFIG_AMRNB_DECODER) += mips/acelp_filters_mips.o \
+                                        mips/celp_filters_mips.o \
+                                        mips/celp_math_mips.o \
+                                        mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \
+                                        mips/celp_filters_mips.o \
+                                        mips/amrwbdec_mips.o \
+                                        mips/celp_math_mips.o \
+                                        mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o
+MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o
+MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o
+MIPSFPU-OBJS-$(CONFIG_FMTCONVERT) += mips/fmtconvert_mips.o
+OBJS-$(CONFIG_AC3DSP) += mips/ac3dsp_mips.o
+OBJS-$(CONFIG_AAC_DECODER) += mips/aacdec_mips.o \
+                              mips/aacsbr_mips.o \
+                              mips/sbrdsp_mips.o \
+                              mips/aacpsdsp_mips.o
+MIPSDSP-OBJS-$(CONFIG_AAC_ENCODER) += mips/aaccoder_mips.o
+MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER) += mips/iirfilter_mips.o
+OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_init_mips.o \
+                               mips/hevcpred_init_mips.o
+OBJS-$(CONFIG_VP9_DECODER) += mips/vp9dsp_init_mips.o
+OBJS-$(CONFIG_VP8_DECODER) += mips/vp8dsp_init_mips.o
+OBJS-$(CONFIG_H264DSP) += mips/h264dsp_init_mips.o
+OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_init_mips.o
+OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o
+OBJS-$(CONFIG_H264PRED) += mips/h264pred_init_mips.o
+OBJS-$(CONFIG_H263DSP) += mips/h263dsp_init_mips.o
+OBJS-$(CONFIG_QPELDSP) += mips/qpeldsp_init_mips.o
+OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_init_mips.o
+OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
+OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
+OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_init_mips.o
+OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
+OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvididct_init_mips.o
+MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
+                                   mips/hevc_mc_uni_msa.o \
+                                   mips/hevc_mc_uniw_msa.o \
+                                   mips/hevc_mc_bi_msa.o \
+                                   mips/hevc_mc_biw_msa.o \
+                                   mips/hevc_idct_msa.o \
+                                   mips/hevc_lpf_sao_msa.o \
+                                   mips/hevcpred_msa.o
+MSA-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_msa.o \
+                                  mips/vp9_lpf_msa.o \
+                                  mips/vp9_idct_msa.o \
+                                  mips/vp9_intra_msa.o
+MSA-OBJS-$(CONFIG_VP8_DECODER) += mips/vp8_mc_msa.o \
+                                  mips/vp8_idct_msa.o \
+                                  mips/vp8_lpf_msa.o
+MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o \
+                              mips/h264idct_msa.o
+MSA-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_msa.o
+MSA-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_msa.o
+MSA-OBJS-$(CONFIG_H264PRED) += mips/h264pred_msa.o
+MSA-OBJS-$(CONFIG_H263DSP) += mips/h263dsp_msa.o
+MSA-OBJS-$(CONFIG_QPELDSP) += mips/qpeldsp_msa.o
+MSA-OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_msa.o
+MSA-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_msa.o
+MSA-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_msa.o
+MSA-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_msa.o \
+                              mips/simple_idct_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_msa.o
+MSA-OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_msa.o
+MMI-OBJS += mips/constants.o
+MMI-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
+MMI-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
+MMI-OBJS-$(CONFIG_H264PRED) += mips/h264pred_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_mmi.o
+MMI-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_mmi.o \
+                              mips/simple_idct_mmi.o
+MMI-OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvid_idct_mmi.o +MMI-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_mmi.o +MMI-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_mmi.o +MMI-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_mmi.o +MMI-OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_mmi.o diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c new file mode 100644 index 0000000000..d690c8c24a --- /dev/null +++ b/libavcodec/mips/aaccoder_mips.c @@ -0,0 +1,2502 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Stanislav Ocovaj (socovaj@mips.com) + * Szabolcs Pal (sabolc@mips.com) + * + * AAC coefficients encoder optimized for MIPS floating-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aaccoder.c + */ + +#include "libavutil/libm.h" + +#include <float.h> +#include "libavutil/mathematics.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/put_bits.h" +#include "libavcodec/aac.h" +#include "libavcodec/aacenc.h" +#include "libavcodec/aactab.h" +#include "libavcodec/aacenctab.h" +#include "libavcodec/aacenc_utils.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +typedef struct BandCodingPath { + int prev_idx; + float cost; + int run; +} BandCodingPath; + +static const uint8_t uquad_sign_bits[81] = { + 0, 1, 1, 1, 2, 2, 1, 2, 2, + 1, 2, 2, 2, 3, 3, 2, 3, 3, + 1, 2, 2, 2, 3, 3, 2, 3, 3, + 1, 2, 2, 2, 3, 3, 2, 3, 3, + 2, 3, 3, 3, 4, 4, 3, 4, 4, + 2, 3, 3, 3, 4, 4, 3, 4, 4, + 1, 2, 2, 2, 3, 3, 2, 3, 3, + 2, 3, 3, 3, 4, 4, 3, 4, 4, + 2, 3, 3, 3, 4, 4, 3, 4, 4 +}; + +static const uint8_t upair7_sign_bits[64] = { + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, +}; + +static const uint8_t upair12_sign_bits[169] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +static const uint8_t esc_sign_bits[289] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +/** + * Functions developed from template function and optimized for quantizing and encoding band + */ +static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t 
*)ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; + float *p_vec = (float *)ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + for (i = 0; i < size; i += 4) { + int curidx; + int *in_int = (int *)&in[i]; + int t0, t1, t2, t3, t4, t5, t6, t7; + const float *vec; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "slt %[qc1], $zero, %[qc1] \n\t" + "slt %[qc2], $zero, %[qc2] \n\t" + "slt %[qc3], $zero, %[qc3] \n\t" + "slt %[qc4], $zero, %[qc4] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 \n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + curidx += 40; + + put_bits(pb, p_bits[curidx], p_codes[curidx]); + + if (out || energy) { + float e1,e2,e3,e4; + vec = &p_vec[curidx*4]; + e1 = vec[0] * IQ; + e2 = vec[1] * IQ; + e3 = vec[2] * IQ; + e4 = vec[3] * IQ; + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; + float *p_vec = (float *)ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + for (i = 0; i < size; i += 4) { + int curidx, sign, count; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int t0, t1, t2, t3, t4; + const float *vec; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 2 \n\t" + "ori %[sign], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn 
%[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign], %[t0], %[qc1] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "slt %[t2], %[t2], $zero \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign], %[t0], %[qc2] \n\t" + "slt %[t4], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count], $zero, %[qc3] \n\t" + "sll %[t0], %[sign], 1 \n\t" + "or %[t0], %[t0], %[t2] \n\t" + "movn %[sign], %[t0], %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count], %[count], %[t4] \n\t" + "addu %[count], %[count], %[t1] \n\t" + "sll %[t0], %[sign], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign], %[t0], %[qc4] \n\t" + "addu %[count], %[count], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign]"=&r"(sign), [count]"=&r"(count), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + + v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1)); + v_bits = p_bits[curidx] + count; + put_bits(pb, v_bits, v_codes); + + if (out || energy) { + float e1,e2,e3,e4; + vec = &p_vec[curidx*4]; + e1 = copysignf(vec[0] * IQ, in[i+0]); + e2 = copysignf(vec[1] * IQ, in[i+1]); + e3 = copysignf(vec[2] * IQ, in[i+2]); + e4 = copysignf(vec[3] * IQ, in[i+3]); + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; + float *p_vec = (float *)ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + for (i = 0; i < size; i += 4) { + int curidx, curidx2; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int t0, t1, t2, t3, t4, t5, t6, t7; + const float *vec1, *vec2; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 4 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 
\n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 9 * qc1; + curidx += qc2 + 40; + + curidx2 = 9 * qc3; + curidx2 += qc4 + 40; + + v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]); + v_bits = p_bits[curidx] + p_bits[curidx2]; + put_bits(pb, v_bits, v_codes); + + if (out || energy) { + float e1,e2,e3,e4; + vec1 = &p_vec[curidx*2 ]; + vec2 = &p_vec[curidx2*2]; + e1 = vec1[0] * IQ; + e2 = vec1[1] * IQ; + e3 = vec2[0] * IQ; + e4 = vec2[1] * IQ; + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; + float *p_vec = (float *)ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + for (i = 0; i < size; i += 4) { + int curidx1, curidx2, sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int t0, t1, t2, t3, t4; + const float *vec1, *vec2; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 7 \n\t" + "ori %[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], 
%[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), [count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "t0", "t1", "t2", "t3", "t4", + "memory" + ); + + curidx1 = 8 * qc1; + curidx1 += qc2; + + v_codes = (p_codes[curidx1] << count1) | sign1; + v_bits = p_bits[curidx1] + count1; + put_bits(pb, v_bits, v_codes); + + curidx2 = 8 * qc3; + curidx2 += qc4; + + v_codes = (p_codes[curidx2] << count2) | sign2; + v_bits = p_bits[curidx2] + count2; + put_bits(pb, v_bits, v_codes); + + if (out || energy) { + float e1,e2,e3,e4; + vec1 = &p_vec[curidx1*2]; + vec2 = &p_vec[curidx2*2]; + e1 = copysignf(vec1[0] * IQ, in[i+0]); + e2 = copysignf(vec1[1] * IQ, in[i+1]); + e3 = copysignf(vec2[0] * IQ, in[i+2]); + e4 = copysignf(vec2[1] * IQ, in[i+3]); + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; + float *p_vec = (float *)ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + for (i = 0; i < size; i += 4) { + int curidx1, curidx2, sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int t0, t1, t2, t3, t4; + const float *vec1, *vec2; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 12 \n\t" + "ori %[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), 
[qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), [count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx1 = 13 * qc1; + curidx1 += qc2; + + v_codes = (p_codes[curidx1] << count1) | sign1; + v_bits = p_bits[curidx1] + count1; + put_bits(pb, v_bits, v_codes); + + curidx2 = 13 * qc3; + curidx2 += qc4; + + v_codes = (p_codes[curidx2] << count2) | sign2; + v_bits = p_bits[curidx2] + count2; + put_bits(pb, v_bits, v_codes); + + if (out || energy) { + float e1,e2,e3,e4; + vec1 = &p_vec[curidx1*2]; + vec2 = &p_vec[curidx2*2]; + e1 = copysignf(vec1[0] * IQ, in[i+0]); + e2 = copysignf(vec1[1] * IQ, in[i+1]); + e3 = copysignf(vec2[0] * IQ, in[i+2]); + e4 = copysignf(vec2[1] * IQ, in[i+3]); + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + float qenergy = 0.0f; + + uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1]; + uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; + float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1]; + + abs_pow34_v(s->scoefs, in, size); + scaled = s->scoefs; + + if (cb < 11) { + for (i = 0; i < size; i += 4) { + int curidx, curidx2, sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int t0, t1, t2, t3, t4; + const float *vec1, *vec2; + + qc1 = scaled[i ] * Q34 + ROUNDING; + qc2 = scaled[i+1] * Q34 + ROUNDING; + qc3 = scaled[i+2] * Q34 + ROUNDING; + qc4 = scaled[i+3] * Q34 + ROUNDING; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 16 \n\t" + "ori %[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), 
[count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 17 * qc1; + curidx += qc2; + curidx2 = 17 * qc3; + curidx2 += qc4; + + v_codes = (p_codes[curidx] << count1) | sign1; + v_bits = p_bits[curidx] + count1; + put_bits(pb, v_bits, v_codes); + + v_codes = (p_codes[curidx2] << count2) | sign2; + v_bits = p_bits[curidx2] + count2; + put_bits(pb, v_bits, v_codes); + + if (out || energy) { + float e1,e2,e3,e4; + vec1 = &p_vectors[curidx*2 ]; + vec2 = &p_vectors[curidx2*2]; + e1 = copysignf(vec1[0] * IQ, in[i+0]); + e2 = copysignf(vec1[1] * IQ, in[i+1]); + e3 = copysignf(vec2[0] * IQ, in[i+2]); + e4 = copysignf(vec2[1] * IQ, in[i+3]); + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + } else { + for (i = 0; i < size; i += 4) { + int curidx, curidx2, sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + uint8_t v_bits; + unsigned int v_codes; + int c1, c2, c3, c4; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUNDING; + qc2 = scaled[i+1] * Q34 + ROUNDING; + qc3 = scaled[i+2] * Q34 + ROUNDING; + qc4 = scaled[i+3] * Q34 + ROUNDING; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 16 \n\t" + "ori %[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "shll_s.w %[c1], %[qc1], 18 \n\t" + "shll_s.w %[c2], %[qc2], 18 \n\t" + "shll_s.w %[c3], %[qc3], 18 \n\t" + "shll_s.w %[c4], %[qc4], 18 \n\t" + "srl %[c1], %[c1], 18 \n\t" + "srl %[c2], %[c2], 18 \n\t" + "srl %[c3], %[c3], 18 \n\t" + "srl %[c4], %[c4], 18 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), [count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [c1]"=&r"(c1), [c2]"=&r"(c2), + [c3]"=&r"(c3), [c4]"=&r"(c4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 17 * qc1; + curidx += qc2; + + curidx2 = 17 * qc3; + curidx2 += qc4; + + v_codes = (p_codes[curidx] << count1) | sign1; + v_bits = p_bits[curidx] + count1; + put_bits(pb, v_bits, v_codes); + + if (p_vectors[curidx*2 ] == 64.0f) { + int len = av_log2(c1); + v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1)); + put_bits(pb, len * 2 - 3, v_codes); + } + if 
(p_vectors[curidx*2+1] == 64.0f) { + int len = av_log2(c2); + v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1)); + put_bits(pb, len*2-3, v_codes); + } + + v_codes = (p_codes[curidx2] << count2) | sign2; + v_bits = p_bits[curidx2] + count2; + put_bits(pb, v_bits, v_codes); + + if (p_vectors[curidx2*2 ] == 64.0f) { + int len = av_log2(c3); + v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1)); + put_bits(pb, len* 2 - 3, v_codes); + } + if (p_vectors[curidx2*2+1] == 64.0f) { + int len = av_log2(c4); + v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1)); + put_bits(pb, len * 2 - 3, v_codes); + } + + if (out || energy) { + float e1, e2, e3, e4; + e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]); + e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]); + e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]); + e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]); + if (out) { + out[i+0] = e1; + out[i+1] = e2; + out[i+2] = e3; + out[i+3] = e4; + } + if (energy) + qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4); + } + } + } + if (energy) + *energy = qenergy; +} + +static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) { + av_assert0(0); +} + +static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) { + int i; + if (bits) + *bits = 0; + if (out) { + for (i = 0; i < size; i += 4) { + out[i ] = 0.0f; + out[i+1] = 0.0f; + out[i+2] = 0.0f; + out[i+3] = 0.0f; + } + } + if (energy) + *energy = 0.0f; +} + +static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s, + PutBitContext *pb, const float *in, float *out, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, const float ROUNDING) = { + quantize_and_encode_band_cost_ZERO_mips, + quantize_and_encode_band_cost_SQUAD_mips, + quantize_and_encode_band_cost_SQUAD_mips, + quantize_and_encode_band_cost_UQUAD_mips, + quantize_and_encode_band_cost_UQUAD_mips, + quantize_and_encode_band_cost_SPAIR_mips, + quantize_and_encode_band_cost_SPAIR_mips, + quantize_and_encode_band_cost_UPAIR7_mips, + quantize_and_encode_band_cost_UPAIR7_mips, + quantize_and_encode_band_cost_UPAIR12_mips, + quantize_and_encode_band_cost_UPAIR12_mips, + quantize_and_encode_band_cost_ESC_mips, + quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */ + quantize_and_encode_band_cost_ZERO_mips, + quantize_and_encode_band_cost_ZERO_mips, + quantize_and_encode_band_cost_ZERO_mips, +}; + +#define quantize_and_encode_band_cost( \ + s, pb, in, out, scaled, size, scale_idx, cb, \ + lambda, uplim, bits, energy, ROUNDING) \ + quantize_and_encode_band_cost_arr[cb]( \ + s, pb, in, out, scaled, size, scale_idx, cb, \ + lambda, uplim, bits, energy, ROUNDING) + +static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb, + const float *in, float *out, int size, int scale_idx, + int cb, const float lambda, int rtz) +{ + quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda, + INFINITY, NULL, NULL, (rtz) ? 
ROUND_TO_ZERO : ROUND_STANDARD); +} + +/** + * Functions developed from template function and optimized for getting the number of bits + */ +static float get_band_numbits_ZERO_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + return 0; +} + +static float get_band_numbits_NONE_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + av_assert0(0); + return 0; +} + +static float get_band_numbits_SQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx; + int *in_int = (int *)&in[i]; + int t0, t1, t2, t3, t4, t5, t6, t7; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "slt %[qc1], $zero, %[qc1] \n\t" + "slt %[qc2], $zero, %[qc2] \n\t" + "slt %[qc3], $zero, %[qc3] \n\t" + "slt %[qc4], $zero, %[qc4] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 \n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + curidx += 40; + + curbits += p_bits[curidx]; + } + return curbits; +} + +static float get_band_numbits_UQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int curbits = 0; + int qc1, qc2, qc3, qc4; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 2 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] 
\n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + + curbits += p_bits[curidx]; + curbits += uquad_sign_bits[curidx]; + } + return curbits; +} + +static float get_band_numbits_SPAIR_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx, curidx2; + int *in_int = (int *)&in[i]; + int t0, t1, t2, t3, t4, t5, t6, t7; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 4 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 \n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 9 * qc1; + curidx += qc2 + 40; + + curidx2 = 9 * qc3; + curidx2 += qc4 + 40; + + curbits += p_bits[curidx] + p_bits[curidx2]; + } + return curbits; +} + +static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx, curidx2; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 7 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + 
+ ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + ); + + curidx = 8 * qc1; + curidx += qc2; + + curidx2 = 8 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx] + + upair7_sign_bits[curidx] + + p_bits[curidx2] + + upair7_sign_bits[curidx2]; + } + return curbits; +} + +static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx, curidx2; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 12 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + ); + + curidx = 13 * qc1; + curidx += qc2; + + curidx2 = 13 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx] + + p_bits[curidx2] + + upair12_sign_bits[curidx] + + upair12_sign_bits[curidx2]; + } + return curbits; +} + +static float get_band_numbits_ESC_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + int i; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; + + for (i = 0; i < size; i += 4) { + int curidx, curidx2; + int cond0, cond1, cond2, cond3; + int c1, c2, c3, c4; + int t4, t5; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 15 \n\t" + "ori %[t5], $zero, 16 \n\t" + "shll_s.w %[c1], %[qc1], 18 \n\t" + "shll_s.w %[c2], %[qc2], 18 \n\t" + "shll_s.w %[c3], %[qc3], 18 \n\t" + "shll_s.w %[c4], %[qc4], 18 \n\t" + "srl %[c1], %[c1], 18 \n\t" + "srl %[c2], %[c2], 18 \n\t" + "srl %[c3], %[c3], 18 \n\t" + "srl %[c4], %[c4], 18 \n\t" + "slt %[cond0], %[t4], %[qc1] \n\t" + "slt %[cond1], %[t4], %[qc2] \n\t" + "slt %[cond2], %[t4], %[qc3] \n\t" + "slt %[cond3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t5], %[cond0] \n\t" + "movn %[qc2], %[t5], %[cond1] \n\t" + "movn %[qc3], %[t5], %[cond2] \n\t" + "movn %[qc4], %[t5], %[cond3] \n\t" + "ori %[t5], $zero, 31 \n\t" + "clz %[c1], %[c1] \n\t" + "clz %[c2], %[c2] \n\t" + "clz %[c3], %[c3] \n\t" + "clz %[c4], %[c4] \n\t" + "subu %[c1], %[t5], %[c1] \n\t" + "subu %[c2], %[t5], %[c2] \n\t" + "subu %[c3], %[t5], %[c3] \n\t" + "subu %[c4], %[t5], %[c4] \n\t" + 
"sll %[c1], %[c1], 1 \n\t" + "sll %[c2], %[c2], 1 \n\t" + "sll %[c3], %[c3], 1 \n\t" + "sll %[c4], %[c4], 1 \n\t" + "addiu %[c1], %[c1], -3 \n\t" + "addiu %[c2], %[c2], -3 \n\t" + "addiu %[c3], %[c3], -3 \n\t" + "addiu %[c4], %[c4], -3 \n\t" + "subu %[cond0], $zero, %[cond0] \n\t" + "subu %[cond1], $zero, %[cond1] \n\t" + "subu %[cond2], $zero, %[cond2] \n\t" + "subu %[cond3], $zero, %[cond3] \n\t" + "and %[c1], %[c1], %[cond0] \n\t" + "and %[c2], %[c2], %[cond1] \n\t" + "and %[c3], %[c3], %[cond2] \n\t" + "and %[c4], %[c4], %[cond3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [cond0]"=&r"(cond0), [cond1]"=&r"(cond1), + [cond2]"=&r"(cond2), [cond3]"=&r"(cond3), + [c1]"=&r"(c1), [c2]"=&r"(c2), + [c3]"=&r"(c3), [c4]"=&r"(c4), + [t4]"=&r"(t4), [t5]"=&r"(t5) + ); + + curidx = 17 * qc1; + curidx += qc2; + + curidx2 = 17 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx]; + curbits += esc_sign_bits[curidx]; + curbits += p_bits[curidx2]; + curbits += esc_sign_bits[curidx2]; + + curbits += c1; + curbits += c2; + curbits += c3; + curbits += c4; + } + return curbits; +} + +static float (*const get_band_numbits_arr[])(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits) = { + get_band_numbits_ZERO_mips, + get_band_numbits_SQUAD_mips, + get_band_numbits_SQUAD_mips, + get_band_numbits_UQUAD_mips, + get_band_numbits_UQUAD_mips, + get_band_numbits_SPAIR_mips, + get_band_numbits_SPAIR_mips, + get_band_numbits_UPAIR7_mips, + get_band_numbits_UPAIR7_mips, + get_band_numbits_UPAIR12_mips, + get_band_numbits_UPAIR12_mips, + get_band_numbits_ESC_mips, + get_band_numbits_NONE_mips, /* cb 12 doesn't exist */ + get_band_numbits_ZERO_mips, + get_band_numbits_ZERO_mips, + get_band_numbits_ZERO_mips, +}; + +#define get_band_numbits( \ + s, pb, in, scaled, size, scale_idx, cb, \ + lambda, uplim, bits) \ + get_band_numbits_arr[cb]( \ + s, pb, in, scaled, size, scale_idx, cb, \ + lambda, uplim, bits) + +static float quantize_band_cost_bits(struct AACEncContext *s, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, int rtz) +{ + return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits); +} + +/** + * Functions developed from template function and optimized for getting the band cost + */ +#if HAVE_MIPSFPU +static float get_band_cost_ZERO_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + int i; + float cost = 0; + + for (i = 0; i < size; i += 4) { + cost += in[i ] * in[i ]; + cost += in[i+1] * in[i+1]; + cost += in[i+2] * in[i+2]; + cost += in[i+3] * in[i+3]; + } + if (bits) + *bits = 0; + if (energy) + *energy = 0.0f; + return cost * lambda; +} + +static float get_band_cost_NONE_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + av_assert0(0); + return 0; +} + +static float get_band_cost_SQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO 
- scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + float cost = 0; + float qenergy = 0.0f; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; i += 4) { + const float *vec; + int curidx; + int *in_int = (int *)&in[i]; + float *in_pos = (float *)&in[i]; + float di0, di1, di2, di3; + int t0, t1, t2, t3, t4, t5, t6, t7; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "slt %[qc1], $zero, %[qc1] \n\t" + "slt %[qc2], $zero, %[qc2] \n\t" + "slt %[qc3], $zero, %[qc3] \n\t" + "slt %[qc4], $zero, %[qc4] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 \n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + curidx += 40; + + curbits += p_bits[curidx]; + vec = &p_codes[curidx*4]; + + qenergy += vec[0]*vec[0] + vec[1]*vec[1] + + vec[2]*vec[2] + vec[3]*vec[3]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "lwc1 $f0, 0(%[in_pos]) \n\t" + "lwc1 $f1, 0(%[vec]) \n\t" + "lwc1 $f2, 4(%[in_pos]) \n\t" + "lwc1 $f3, 4(%[vec]) \n\t" + "lwc1 $f4, 8(%[in_pos]) \n\t" + "lwc1 $f5, 8(%[vec]) \n\t" + "lwc1 $f6, 12(%[in_pos]) \n\t" + "lwc1 $f7, 12(%[vec]) \n\t" + "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t" + "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t" + "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t" + "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t" + + ".set pop \n\t" + + : [di0]"=&f"(di0), [di1]"=&f"(di1), + [di2]"=&f"(di2), [di3]"=&f"(di3) + : [in_pos]"r"(in_pos), [vec]"r"(vec), + [IQ]"f"(IQ) + : "$f0", "$f1", "$f2", "$f3", + "$f4", "$f5", "$f6", "$f7", + "memory" + ); + + cost += di0 * di0 + di1 * di1 + + di2 * di2 + di3 * di3; + } + + if (bits) + *bits = curbits; + if (energy) + *energy = qenergy * (IQ*IQ); + return cost * lambda + curbits; +} + +static float get_band_cost_UQUAD_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + float cost = 0; + float qenergy = 0.0f; + int curbits = 0; + int qc1, qc2, qc3, qc4; + + uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; 
i += 4) { + const float *vec; + int curidx; + float *in_pos = (float *)&in[i]; + float di0, di1, di2, di3; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 2 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + ); + + curidx = qc1; + curidx *= 3; + curidx += qc2; + curidx *= 3; + curidx += qc3; + curidx *= 3; + curidx += qc4; + + curbits += p_bits[curidx]; + curbits += uquad_sign_bits[curidx]; + vec = &p_codes[curidx*4]; + + qenergy += vec[0]*vec[0] + vec[1]*vec[1] + + vec[2]*vec[2] + vec[3]*vec[3]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "lwc1 %[di0], 0(%[in_pos]) \n\t" + "lwc1 %[di1], 4(%[in_pos]) \n\t" + "lwc1 %[di2], 8(%[in_pos]) \n\t" + "lwc1 %[di3], 12(%[in_pos]) \n\t" + "abs.s %[di0], %[di0] \n\t" + "abs.s %[di1], %[di1] \n\t" + "abs.s %[di2], %[di2] \n\t" + "abs.s %[di3], %[di3] \n\t" + "lwc1 $f0, 0(%[vec]) \n\t" + "lwc1 $f1, 4(%[vec]) \n\t" + "lwc1 $f2, 8(%[vec]) \n\t" + "lwc1 $f3, 12(%[vec]) \n\t" + "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" + "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" + "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" + "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" + + ".set pop \n\t" + + : [di0]"=&f"(di0), [di1]"=&f"(di1), + [di2]"=&f"(di2), [di3]"=&f"(di3) + : [in_pos]"r"(in_pos), [vec]"r"(vec), + [IQ]"f"(IQ) + : "$f0", "$f1", "$f2", "$f3", + "memory" + ); + + cost += di0 * di0 + di1 * di1 + + di2 * di2 + di3 * di3; + } + + if (bits) + *bits = curbits; + if (energy) + *energy = qenergy * (IQ*IQ); + return cost * lambda + curbits; +} + +static float get_band_cost_SPAIR_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + float cost = 0; + float qenergy = 0.0f; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; i += 4) { + const float *vec, *vec2; + int curidx, curidx2; + int *in_int = (int *)&in[i]; + float *in_pos = (float *)&in[i]; + float di0, di1, di2, di3; + int t0, t1, t2, t3, t4, t5, t6, t7; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 4 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] 
\n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "srl %[t0], %[t0], 31 \n\t" + "srl %[t1], %[t1], 31 \n\t" + "srl %[t2], %[t2], 31 \n\t" + "srl %[t3], %[t3], 31 \n\t" + "subu %[t4], $zero, %[qc1] \n\t" + "subu %[t5], $zero, %[qc2] \n\t" + "subu %[t6], $zero, %[qc3] \n\t" + "subu %[t7], $zero, %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t5], %[t1] \n\t" + "movn %[qc3], %[t6], %[t2] \n\t" + "movn %[qc4], %[t7], %[t3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 9 * qc1; + curidx += qc2 + 40; + + curidx2 = 9 * qc3; + curidx2 += qc4 + 40; + + curbits += p_bits[curidx]; + curbits += p_bits[curidx2]; + + vec = &p_codes[curidx*2]; + vec2 = &p_codes[curidx2*2]; + + qenergy += vec[0]*vec[0] + vec[1]*vec[1] + + vec2[0]*vec2[0] + vec2[1]*vec2[1]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "lwc1 $f0, 0(%[in_pos]) \n\t" + "lwc1 $f1, 0(%[vec]) \n\t" + "lwc1 $f2, 4(%[in_pos]) \n\t" + "lwc1 $f3, 4(%[vec]) \n\t" + "lwc1 $f4, 8(%[in_pos]) \n\t" + "lwc1 $f5, 0(%[vec2]) \n\t" + "lwc1 $f6, 12(%[in_pos]) \n\t" + "lwc1 $f7, 4(%[vec2]) \n\t" + "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t" + "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t" + "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t" + "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t" + + ".set pop \n\t" + + : [di0]"=&f"(di0), [di1]"=&f"(di1), + [di2]"=&f"(di2), [di3]"=&f"(di3) + : [in_pos]"r"(in_pos), [vec]"r"(vec), + [vec2]"r"(vec2), [IQ]"f"(IQ) + : "$f0", "$f1", "$f2", "$f3", + "$f4", "$f5", "$f6", "$f7", + "memory" + ); + + cost += di0 * di0 + di1 * di1 + + di2 * di2 + di3 * di3; + } + + if (bits) + *bits = curbits; + if (energy) + *energy = qenergy * (IQ*IQ); + return cost * lambda + curbits; +} + +static float get_band_cost_UPAIR7_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + float cost = 0; + float qenergy = 0.0f; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; i += 4) { + const float *vec, *vec2; + int curidx, curidx2, sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + float *in_pos = (float *)&in[i]; + float di0, di1, di2, di3; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 7 \n\t" + "ori %[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw 
%[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), [count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 8 * qc1; + curidx += qc2; + + curidx2 = 8 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx]; + curbits += upair7_sign_bits[curidx]; + vec = &p_codes[curidx*2]; + + curbits += p_bits[curidx2]; + curbits += upair7_sign_bits[curidx2]; + vec2 = &p_codes[curidx2*2]; + + qenergy += vec[0]*vec[0] + vec[1]*vec[1] + + vec2[0]*vec2[0] + vec2[1]*vec2[1]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "lwc1 %[di0], 0(%[in_pos]) \n\t" + "lwc1 %[di1], 4(%[in_pos]) \n\t" + "lwc1 %[di2], 8(%[in_pos]) \n\t" + "lwc1 %[di3], 12(%[in_pos]) \n\t" + "abs.s %[di0], %[di0] \n\t" + "abs.s %[di1], %[di1] \n\t" + "abs.s %[di2], %[di2] \n\t" + "abs.s %[di3], %[di3] \n\t" + "lwc1 $f0, 0(%[vec]) \n\t" + "lwc1 $f1, 4(%[vec]) \n\t" + "lwc1 $f2, 0(%[vec2]) \n\t" + "lwc1 $f3, 4(%[vec2]) \n\t" + "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" + "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" + "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" + "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" + + ".set pop \n\t" + + : [di0]"=&f"(di0), [di1]"=&f"(di1), + [di2]"=&f"(di2), [di3]"=&f"(di3) + : [in_pos]"r"(in_pos), [vec]"r"(vec), + [vec2]"r"(vec2), [IQ]"f"(IQ) + : "$f0", "$f1", "$f2", "$f3", + "memory" + ); + + cost += di0 * di0 + di1 * di1 + + di2 * di2 + di3 * di3; + } + + if (bits) + *bits = curbits; + if (energy) + *energy = qenergy * (IQ*IQ); + return cost * lambda + curbits; +} + +static float get_band_cost_UPAIR12_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + int i; + float cost = 0; + float qenergy = 0.0f; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; i += 4) { + const float *vec, *vec2; + int curidx, curidx2; + int sign1, count1, sign2, count2; + int *in_int = (int *)&in[i]; + float *in_pos = (float *)&in[i]; + float di0, di1, di2, di3; + int t0, t1, t2, t3, t4; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t4], $zero, 12 \n\t" + "ori 
%[sign1], $zero, 0 \n\t" + "ori %[sign2], $zero, 0 \n\t" + "slt %[t0], %[t4], %[qc1] \n\t" + "slt %[t1], %[t4], %[qc2] \n\t" + "slt %[t2], %[t4], %[qc3] \n\t" + "slt %[t3], %[t4], %[qc4] \n\t" + "movn %[qc1], %[t4], %[t0] \n\t" + "movn %[qc2], %[t4], %[t1] \n\t" + "movn %[qc3], %[t4], %[t2] \n\t" + "movn %[qc4], %[t4], %[t3] \n\t" + "lw %[t0], 0(%[in_int]) \n\t" + "lw %[t1], 4(%[in_int]) \n\t" + "lw %[t2], 8(%[in_int]) \n\t" + "lw %[t3], 12(%[in_int]) \n\t" + "slt %[t0], %[t0], $zero \n\t" + "movn %[sign1], %[t0], %[qc1] \n\t" + "slt %[t2], %[t2], $zero \n\t" + "movn %[sign2], %[t2], %[qc3] \n\t" + "slt %[t1], %[t1], $zero \n\t" + "sll %[t0], %[sign1], 1 \n\t" + "or %[t0], %[t0], %[t1] \n\t" + "movn %[sign1], %[t0], %[qc2] \n\t" + "slt %[t3], %[t3], $zero \n\t" + "sll %[t0], %[sign2], 1 \n\t" + "or %[t0], %[t0], %[t3] \n\t" + "movn %[sign2], %[t0], %[qc4] \n\t" + "slt %[count1], $zero, %[qc1] \n\t" + "slt %[t1], $zero, %[qc2] \n\t" + "slt %[count2], $zero, %[qc3] \n\t" + "slt %[t2], $zero, %[qc4] \n\t" + "addu %[count1], %[count1], %[t1] \n\t" + "addu %[count2], %[count2], %[t2] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [sign1]"=&r"(sign1), [count1]"=&r"(count1), + [sign2]"=&r"(sign2), [count2]"=&r"(count2), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4) + : [in_int]"r"(in_int) + : "memory" + ); + + curidx = 13 * qc1; + curidx += qc2; + + curidx2 = 13 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx]; + curbits += p_bits[curidx2]; + curbits += upair12_sign_bits[curidx]; + curbits += upair12_sign_bits[curidx2]; + vec = &p_codes[curidx*2]; + vec2 = &p_codes[curidx2*2]; + + qenergy += vec[0]*vec[0] + vec[1]*vec[1] + + vec2[0]*vec2[0] + vec2[1]*vec2[1]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "lwc1 %[di0], 0(%[in_pos]) \n\t" + "lwc1 %[di1], 4(%[in_pos]) \n\t" + "lwc1 %[di2], 8(%[in_pos]) \n\t" + "lwc1 %[di3], 12(%[in_pos]) \n\t" + "abs.s %[di0], %[di0] \n\t" + "abs.s %[di1], %[di1] \n\t" + "abs.s %[di2], %[di2] \n\t" + "abs.s %[di3], %[di3] \n\t" + "lwc1 $f0, 0(%[vec]) \n\t" + "lwc1 $f1, 4(%[vec]) \n\t" + "lwc1 $f2, 0(%[vec2]) \n\t" + "lwc1 $f3, 4(%[vec2]) \n\t" + "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" + "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" + "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" + "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" + + ".set pop \n\t" + + : [di0]"=&f"(di0), [di1]"=&f"(di1), + [di2]"=&f"(di2), [di3]"=&f"(di3) + : [in_pos]"r"(in_pos), [vec]"r"(vec), + [vec2]"r"(vec2), [IQ]"f"(IQ) + : "$f0", "$f1", "$f2", "$f3", + "memory" + ); + + cost += di0 * di0 + di1 * di1 + + di2 * di2 + di3 * di3; + } + + if (bits) + *bits = curbits; + if (energy) + *energy = qenergy * (IQ*IQ); + return cost * lambda + curbits; +} + +static float get_band_cost_ESC_mips(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) +{ + const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; + const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; + const float CLIPPED_ESCAPE = 165140.0f * IQ; + int i; + float cost = 0; + float qenergy = 0.0f; + int qc1, qc2, qc3, qc4; + int curbits = 0; + + uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; + float *p_codes = (float* )ff_aac_codebook_vectors[cb-1]; + + for (i = 0; i < size; i += 4) { + const float *vec, *vec2; + int curidx, 
curidx2; + float t1, t2, t3, t4, V; + float di1, di2, di3, di4; + int cond0, cond1, cond2, cond3; + int c1, c2, c3, c4; + int t6, t7; + + qc1 = scaled[i ] * Q34 + ROUND_STANDARD; + qc2 = scaled[i+1] * Q34 + ROUND_STANDARD; + qc3 = scaled[i+2] * Q34 + ROUND_STANDARD; + qc4 = scaled[i+3] * Q34 + ROUND_STANDARD; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "ori %[t6], $zero, 15 \n\t" + "ori %[t7], $zero, 16 \n\t" + "shll_s.w %[c1], %[qc1], 18 \n\t" + "shll_s.w %[c2], %[qc2], 18 \n\t" + "shll_s.w %[c3], %[qc3], 18 \n\t" + "shll_s.w %[c4], %[qc4], 18 \n\t" + "srl %[c1], %[c1], 18 \n\t" + "srl %[c2], %[c2], 18 \n\t" + "srl %[c3], %[c3], 18 \n\t" + "srl %[c4], %[c4], 18 \n\t" + "slt %[cond0], %[t6], %[qc1] \n\t" + "slt %[cond1], %[t6], %[qc2] \n\t" + "slt %[cond2], %[t6], %[qc3] \n\t" + "slt %[cond3], %[t6], %[qc4] \n\t" + "movn %[qc1], %[t7], %[cond0] \n\t" + "movn %[qc2], %[t7], %[cond1] \n\t" + "movn %[qc3], %[t7], %[cond2] \n\t" + "movn %[qc4], %[t7], %[cond3] \n\t" + + ".set pop \n\t" + + : [qc1]"+r"(qc1), [qc2]"+r"(qc2), + [qc3]"+r"(qc3), [qc4]"+r"(qc4), + [cond0]"=&r"(cond0), [cond1]"=&r"(cond1), + [cond2]"=&r"(cond2), [cond3]"=&r"(cond3), + [c1]"=&r"(c1), [c2]"=&r"(c2), + [c3]"=&r"(c3), [c4]"=&r"(c4), + [t6]"=&r"(t6), [t7]"=&r"(t7) + ); + + curidx = 17 * qc1; + curidx += qc2; + + curidx2 = 17 * qc3; + curidx2 += qc4; + + curbits += p_bits[curidx]; + curbits += esc_sign_bits[curidx]; + vec = &p_codes[curidx*2]; + + curbits += p_bits[curidx2]; + curbits += esc_sign_bits[curidx2]; + vec2 = &p_codes[curidx2*2]; + + curbits += (av_log2(c1) * 2 - 3) & (-cond0); + curbits += (av_log2(c2) * 2 - 3) & (-cond1); + curbits += (av_log2(c3) * 2 - 3) & (-cond2); + curbits += (av_log2(c4) * 2 - 3) & (-cond3); + + t1 = fabsf(in[i ]); + t2 = fabsf(in[i+1]); + t3 = fabsf(in[i+2]); + t4 = fabsf(in[i+3]); + + if (cond0) { + if (t1 >= CLIPPED_ESCAPE) { + di1 = t1 - CLIPPED_ESCAPE; + qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE; + } else { + di1 = t1 - (V = c1 * cbrtf(c1) * IQ); + qenergy += V*V; + } + } else { + di1 = t1 - (V = vec[0] * IQ); + qenergy += V*V; + } + + if (cond1) { + if (t2 >= CLIPPED_ESCAPE) { + di2 = t2 - CLIPPED_ESCAPE; + qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE; + } else { + di2 = t2 - (V = c2 * cbrtf(c2) * IQ); + qenergy += V*V; + } + } else { + di2 = t2 - (V = vec[1] * IQ); + qenergy += V*V; + } + + if (cond2) { + if (t3 >= CLIPPED_ESCAPE) { + di3 = t3 - CLIPPED_ESCAPE; + qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE; + } else { + di3 = t3 - (V = c3 * cbrtf(c3) * IQ); + qenergy += V*V; + } + } else { + di3 = t3 - (V = vec2[0] * IQ); + qenergy += V*V; + } + + if (cond3) { + if (t4 >= CLIPPED_ESCAPE) { + di4 = t4 - CLIPPED_ESCAPE; + qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE; + } else { + di4 = t4 - (V = c4 * cbrtf(c4) * IQ); + qenergy += V*V; + } + } else { + di4 = t4 - (V = vec2[1]*IQ); + qenergy += V*V; + } + + cost += di1 * di1 + di2 * di2 + + di3 * di3 + di4 * di4; + } + + if (bits) + *bits = curbits; + return cost * lambda + curbits; +} + +static float (*const get_band_cost_arr[])(struct AACEncContext *s, + PutBitContext *pb, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy) = { + get_band_cost_ZERO_mips, + get_band_cost_SQUAD_mips, + get_band_cost_SQUAD_mips, + get_band_cost_UQUAD_mips, + get_band_cost_UQUAD_mips, + get_band_cost_SPAIR_mips, + get_band_cost_SPAIR_mips, + get_band_cost_UPAIR7_mips, + get_band_cost_UPAIR7_mips, + get_band_cost_UPAIR12_mips, + 
get_band_cost_UPAIR12_mips, + get_band_cost_ESC_mips, + get_band_cost_NONE_mips, /* cb 12 doesn't exist */ + get_band_cost_ZERO_mips, + get_band_cost_ZERO_mips, + get_band_cost_ZERO_mips, +}; + +#define get_band_cost( \ + s, pb, in, scaled, size, scale_idx, cb, \ + lambda, uplim, bits, energy) \ + get_band_cost_arr[cb]( \ + s, pb, in, scaled, size, scale_idx, cb, \ + lambda, uplim, bits, energy) + +static float quantize_band_cost(struct AACEncContext *s, const float *in, + const float *scaled, int size, int scale_idx, + int cb, const float lambda, const float uplim, + int *bits, float *energy, int rtz) +{ + return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy); +} + +#include "libavcodec/aacenc_quantization_misc.h" + +#include "libavcodec/aaccoder_twoloop.h" + +static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe) +{ + int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side; + uint8_t nextband0[128], nextband1[128]; + float M[128], S[128]; + float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3; + const float lambda = s->lambda; + const float mslambda = FFMIN(1.0f, lambda / 120.f); + SingleChannelElement *sce0 = &cpe->ch[0]; + SingleChannelElement *sce1 = &cpe->ch[1]; + if (!cpe->common_window) + return; + + /** Scout out next nonzero bands */ + ff_init_nextband_map(sce0, nextband0); + ff_init_nextband_map(sce1, nextband1); + + prev_mid = sce0->sf_idx[0]; + prev_side = sce1->sf_idx[0]; + for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) { + start = 0; + for (g = 0; g < sce0->ics.num_swb; g++) { + float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f; + if (!cpe->is_mask[w*16+g]) + cpe->ms_mask[w*16+g] = 0; + if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) { + float Mmax = 0.0f, Smax = 0.0f; + + /* Must compute mid/side SF and book for the whole window group */ + for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) { + for (i = 0; i < sce0->ics.swb_sizes[g]; i++) { + M[i] = (sce0->coeffs[start+(w+w2)*128+i] + + sce1->coeffs[start+(w+w2)*128+i]) * 0.5; + S[i] = M[i] + - sce1->coeffs[start+(w+w2)*128+i]; + } + abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]); + abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]); + for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) { + Mmax = FFMAX(Mmax, M34[i]); + Smax = FFMAX(Smax, S34[i]); + } + } + + for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) { + float dist1 = 0.0f, dist2 = 0.0f; + int B0 = 0, B1 = 0; + int minidx; + int mididx, sididx; + int midcb, sidcb; + + minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]); + mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512); + sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512); + if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT + && ( !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g) + || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) { + /* scalefactor range violation, bad stuff, will decrease quality unacceptably */ + continue; + } + + midcb = find_min_book(Mmax, mididx); + sidcb = find_min_book(Smax, sididx); + + /* No CB can be zero */ + midcb = FFMAX(1,midcb); + sidcb = FFMAX(1,sidcb); + + for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) { + FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g]; + FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g]; + float minthr = FFMIN(band0->threshold, band1->threshold); + int 
b1,b2,b3,b4; + for (i = 0; i < sce0->ics.swb_sizes[g]; i++) { + M[i] = (sce0->coeffs[start+(w+w2)*128+i] + + sce1->coeffs[start+(w+w2)*128+i]) * 0.5; + S[i] = M[i] + - sce1->coeffs[start+(w+w2)*128+i]; + } + + abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); + abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); + abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]); + abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]); + dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128], + L34, + sce0->ics.swb_sizes[g], + sce0->sf_idx[w*16+g], + sce0->band_type[w*16+g], + lambda / band0->threshold, INFINITY, &b1, NULL, 0); + dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128], + R34, + sce1->ics.swb_sizes[g], + sce1->sf_idx[w*16+g], + sce1->band_type[w*16+g], + lambda / band1->threshold, INFINITY, &b2, NULL, 0); + dist2 += quantize_band_cost(s, M, + M34, + sce0->ics.swb_sizes[g], + mididx, + midcb, + lambda / minthr, INFINITY, &b3, NULL, 0); + dist2 += quantize_band_cost(s, S, + S34, + sce1->ics.swb_sizes[g], + sididx, + sidcb, + mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0); + B0 += b1+b2; + B1 += b3+b4; + dist1 -= b1+b2; + dist2 -= b3+b4; + } + cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0; + if (cpe->ms_mask[w*16+g]) { + if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) { + sce0->sf_idx[w*16+g] = mididx; + sce1->sf_idx[w*16+g] = sididx; + sce0->band_type[w*16+g] = midcb; + sce1->band_type[w*16+g] = sidcb; + } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) { + /* ms_mask unneeded, and it confuses some decoders */ + cpe->ms_mask[w*16+g] = 0; + } + break; + } else if (B1 > B0) { + /* More boost won't fix this */ + break; + } + } + } + if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT) + prev_mid = sce0->sf_idx[w*16+g]; + if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT) + prev_side = sce1->sf_idx[w*16+g]; + start += sce0->ics.swb_sizes[g]; + } + } +} +#endif /*HAVE_MIPSFPU */ + +#include "libavcodec/aaccoder_trellis.h" + +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_aac_coder_init_mips(AACEncContext *c) { +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + AACCoefficientsEncoder *e = c->coder; + int option = c->options.coder; + + if (option == 2) { + e->quantize_and_encode_band = quantize_and_encode_band_mips; + e->encode_window_bands_info = codebook_trellis_rate; +#if HAVE_MIPSFPU + e->search_for_quantizers = search_for_quantizers_twoloop; +#endif /* HAVE_MIPSFPU */ + } +#if HAVE_MIPSFPU + e->search_for_ms = search_for_ms_mips; +#endif /* HAVE_MIPSFPU */ +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/aacdec_mips.c b/libavcodec/mips/aacdec_mips.c new file mode 100644 index 0000000000..253cdeb80b --- /dev/null +++ b/libavcodec/mips/aacdec_mips.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Darko Laus (darko@mips.com) + * Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacdec.c + */ + +#include "libavcodec/aac.h" +#include "aacdec_mips.h" +#include "libavcodec/aactab.h" +#include "libavcodec/sinewin.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +static av_always_inline void float_copy(float *dst, const float *src, int count) +{ + // Copy 'count' floats from src to dst + const float *loop_end = src + count; + int temp[8]; + + // count must be a multiple of 8 + av_assert2(count % 8 == 0); + + // loop unrolled 8 times + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "1: \n\t" + "lw %[temp0], 0(%[src]) \n\t" + "lw %[temp1], 4(%[src]) \n\t" + "lw %[temp2], 8(%[src]) \n\t" + "lw %[temp3], 12(%[src]) \n\t" + "lw %[temp4], 16(%[src]) \n\t" + "lw %[temp5], 20(%[src]) \n\t" + "lw %[temp6], 24(%[src]) \n\t" + "lw %[temp7], 28(%[src]) \n\t" + PTR_ADDIU "%[src], %[src], 32 \n\t" + "sw %[temp0], 0(%[dst]) \n\t" + "sw %[temp1], 4(%[dst]) \n\t" + "sw %[temp2], 8(%[dst]) \n\t" + "sw %[temp3], 12(%[dst]) \n\t" + "sw %[temp4], 16(%[dst]) \n\t" + "sw %[temp5], 20(%[dst]) \n\t" + "sw %[temp6], 24(%[dst]) \n\t" + "sw %[temp7], 28(%[dst]) \n\t" + "bne %[src], %[loop_end], 1b \n\t" + PTR_ADDIU "%[dst], %[dst], 32 \n\t" + ".set pop \n\t" + + : [temp0]"=&r"(temp[0]), [temp1]"=&r"(temp[1]), + [temp2]"=&r"(temp[2]), [temp3]"=&r"(temp[3]), + [temp4]"=&r"(temp[4]), [temp5]"=&r"(temp[5]), + [temp6]"=&r"(temp[6]), [temp7]"=&r"(temp[7]), + [src]"+r"(src), [dst]"+r"(dst) + : [loop_end]"r"(loop_end) + : "memory" + ); +} + +static av_always_inline int lcg_random(unsigned previous_val) +{ + union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 }; + return v.s; +} + +static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce) +{ + IndividualChannelStream *ics = &sce->ics; + float *in = sce->coeffs; + float *out = sce->ret; + float *saved = sce->saved; + const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128; + const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024; + const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128; + float *buf = ac->buf_mdct; + int i; + + if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { + for (i = 0; i < 1024; i += 128) + ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i); + } else + ac->mdct.imdct_half(&ac->mdct, buf, in); + + /* window overlapping + * NOTE: To simplify the overlapping code, all 'meaningless' short to long + * and long to short transitions are considered to be short to short + * transitions. This leaves just two cases (long to long and short to short) + * with a little special sauce for EIGHT_SHORT_SEQUENCE. 
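+ *
+ * (Added illustration, assuming FFmpeg's usual vector_fmul_window()
+ * semantics rather than anything stated in the original comment.) The
+ * long-to-long branch below is a plain windowed overlap-add over 512
+ * sample pairs, roughly:
+ *
+ *     out[k]      = saved[k] * w[1023-k] - buf[511-k] * w[k]
+ *     out[1023-k] = saved[k] * w[k]      + buf[511-k] * w[1023-k]
+ *
+ * with w = lwindow_prev and 0 <= k < 512. The hand-unrolled loop in the
+ * EIGHT_SHORT_SEQUENCE branch computes the same pair of products per
+ * sample for each 128-sample short window, just with manual pointer
+ * bookkeeping.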
+ */ + if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) && + (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) { + ac->fdsp->vector_fmul_window( out, saved, buf, lwindow_prev, 512); + } else { + float_copy(out, saved, 448); + + if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { + { + float wi; + float wj; + int i; + float temp0, temp1, temp2, temp3; + float *dst0 = out + 448 + 0*128; + float *dst1 = dst0 + 64 + 63; + float *dst2 = saved + 63; + float *win0 = (float*)swindow; + float *win1 = win0 + 64 + 63; + float *win0_prev = (float*)swindow_prev; + float *win1_prev = win0_prev + 64 + 63; + float *src0_prev = saved + 448; + float *src1_prev = buf + 0*128 + 63; + float *src0 = buf + 0*128 + 64; + float *src1 = buf + 1*128 + 63; + + for(i = 0; i < 64; i++) + { + temp0 = src0_prev[0]; + temp1 = src1_prev[0]; + wi = *win0_prev; + wj = *win1_prev; + temp2 = src0[0]; + temp3 = src1[0]; + dst0[0] = temp0 * wj - temp1 * wi; + dst1[0] = temp0 * wi + temp1 * wj; + + wi = *win0; + wj = *win1; + + temp0 = src0[128]; + temp1 = src1[128]; + dst0[128] = temp2 * wj - temp3 * wi; + dst1[128] = temp2 * wi + temp3 * wj; + + temp2 = src0[256]; + temp3 = src1[256]; + dst0[256] = temp0 * wj - temp1 * wi; + dst1[256] = temp0 * wi + temp1 * wj; + dst0[384] = temp2 * wj - temp3 * wi; + dst1[384] = temp2 * wi + temp3 * wj; + + temp0 = src0[384]; + temp1 = src1[384]; + dst0[512] = temp0 * wj - temp1 * wi; + dst2[0] = temp0 * wi + temp1 * wj; + + src0++; + src1--; + src0_prev++; + src1_prev--; + win0++; + win1--; + win0_prev++; + win1_prev--; + dst0++; + dst1--; + dst2--; + } + } + } else { + ac->fdsp->vector_fmul_window(out + 448, saved + 448, buf, swindow_prev, 64); + float_copy(out + 576, buf + 64, 448); + } + } + + // buffer update + if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { + ac->fdsp->vector_fmul_window(saved + 64, buf + 4*128 + 64, buf + 5*128, swindow, 64); + ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64); + ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64); + float_copy(saved + 448, buf + 7*128 + 64, 64); + } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { + float_copy(saved, buf + 512, 448); + float_copy(saved + 448, buf + 7*128 + 64, 64); + } else { // LONG_STOP or ONLY_LONG + float_copy(saved, buf + 512, 512); + } +} + +static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce) +{ + const LongTermPrediction *ltp = &sce->ics.ltp; + const uint16_t *offsets = sce->ics.swb_offset; + int i, sfb; + int j, k; + + if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) { + float *predTime = sce->ret; + float *predFreq = ac->buf_mdct; + float *p_predTime; + int16_t num_samples = 2048; + + if (ltp->lag < 1024) + num_samples = ltp->lag + 1024; + j = (2048 - num_samples) >> 2; + k = (2048 - num_samples) & 3; + p_predTime = &predTime[num_samples]; + + for (i = 0; i < num_samples; i++) + predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef; + for (i = 0; i < j; i++) { + + /* loop unrolled 4 times */ + __asm__ volatile ( + "sw $0, 0(%[p_predTime]) \n\t" + "sw $0, 4(%[p_predTime]) \n\t" + "sw $0, 8(%[p_predTime]) \n\t" + "sw $0, 12(%[p_predTime]) \n\t" + PTR_ADDIU "%[p_predTime], %[p_predTime], 16 \n\t" + + : [p_predTime]"+r"(p_predTime) + : + : "memory" + ); + } + for (i = 0; i < k; i++) { + + __asm__ volatile ( + "sw $0, 0(%[p_predTime]) \n\t" + PTR_ADDIU "%[p_predTime], %[p_predTime], 4 \n\t" + + 
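+ /* (added note) this remainder loop and the 4x-unrolled one above only
+ zero-fill predTime[num_samples..2047]; a plain-C equivalent, assuming
+ nothing beyond what the zero stores show, would be:
+ memset(&predTime[num_samples], 0, (2048 - num_samples) * sizeof(float)); */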
: [p_predTime]"+r"(p_predTime) + : + : "memory" + ); + } + + ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics); + + if (sce->tns.present) + ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0); + + for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++) + if (ltp->used[sfb]) + for (i = offsets[sfb]; i < offsets[sfb + 1]; i++) + sce->coeffs[i] += predFreq[i]; + } +} + +#if HAVE_MIPSFPU +static av_always_inline void fmul_and_reverse(float *dst, const float *src0, const float *src1, int count) +{ + /* Multiply 'count' floats in src0 by src1 and store the results in dst in reverse */ + /* This should be equivalent to a normal fmul, followed by reversing dst */ + + // count must be a multiple of 4 + av_assert2(count % 4 == 0); + + // move src0 and src1 to the last element of their arrays + src0 += count - 1; + src1 += count - 1; + + for (; count > 0; count -= 4){ + float temp[12]; + + /* loop unrolled 4 times */ + __asm__ volatile ( + "lwc1 %[temp0], 0(%[ptr2]) \n\t" + "lwc1 %[temp1], -4(%[ptr2]) \n\t" + "lwc1 %[temp2], -8(%[ptr2]) \n\t" + "lwc1 %[temp3], -12(%[ptr2]) \n\t" + "lwc1 %[temp4], 0(%[ptr3]) \n\t" + "lwc1 %[temp5], -4(%[ptr3]) \n\t" + "lwc1 %[temp6], -8(%[ptr3]) \n\t" + "lwc1 %[temp7], -12(%[ptr3]) \n\t" + "mul.s %[temp8], %[temp0], %[temp4] \n\t" + "mul.s %[temp9], %[temp1], %[temp5] \n\t" + "mul.s %[temp10], %[temp2], %[temp6] \n\t" + "mul.s %[temp11], %[temp3], %[temp7] \n\t" + "swc1 %[temp8], 0(%[ptr1]) \n\t" + "swc1 %[temp9], 4(%[ptr1]) \n\t" + "swc1 %[temp10], 8(%[ptr1]) \n\t" + "swc1 %[temp11], 12(%[ptr1]) \n\t" + PTR_ADDIU "%[ptr1], %[ptr1], 16 \n\t" + PTR_ADDIU "%[ptr2], %[ptr2], -16 \n\t" + PTR_ADDIU "%[ptr3], %[ptr3], -16 \n\t" + + : [temp0]"=&f"(temp[0]), [temp1]"=&f"(temp[1]), + [temp2]"=&f"(temp[2]), [temp3]"=&f"(temp[3]), + [temp4]"=&f"(temp[4]), [temp5]"=&f"(temp[5]), + [temp6]"=&f"(temp[6]), [temp7]"=&f"(temp[7]), + [temp8]"=&f"(temp[8]), [temp9]"=&f"(temp[9]), + [temp10]"=&f"(temp[10]), [temp11]"=&f"(temp[11]), + [ptr1]"+r"(dst), [ptr2]"+r"(src0), [ptr3]"+r"(src1) + : + : "memory" + ); + } +} + +static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce) +{ + IndividualChannelStream *ics = &sce->ics; + float *saved = sce->saved; + float *saved_ltp = sce->coeffs; + const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024; + const float *swindow = ics->use_kb_window[0] ? 
ff_aac_kbd_short_128 : ff_sine_128; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { + float *p_saved_ltp = saved_ltp + 576; + float *loop_end1 = p_saved_ltp + 448; + + float_copy(saved_ltp, saved, 512); + + /* loop unrolled 8 times */ + __asm__ volatile ( + "1: \n\t" + "sw $0, 0(%[p_saved_ltp]) \n\t" + "sw $0, 4(%[p_saved_ltp]) \n\t" + "sw $0, 8(%[p_saved_ltp]) \n\t" + "sw $0, 12(%[p_saved_ltp]) \n\t" + "sw $0, 16(%[p_saved_ltp]) \n\t" + "sw $0, 20(%[p_saved_ltp]) \n\t" + "sw $0, 24(%[p_saved_ltp]) \n\t" + "sw $0, 28(%[p_saved_ltp]) \n\t" + PTR_ADDIU "%[p_saved_ltp],%[p_saved_ltp], 32 \n\t" + "bne %[p_saved_ltp], %[loop_end1], 1b \n\t" + + : [p_saved_ltp]"+r"(p_saved_ltp) + : [loop_end1]"r"(loop_end1) + : "memory" + ); + + ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); + fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64); + } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { + float *buff0 = saved; + float *buff1 = saved_ltp; + float *loop_end = saved + 448; + + /* loop unrolled 8 times */ + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "1: \n\t" + "lw %[temp0], 0(%[src]) \n\t" + "lw %[temp1], 4(%[src]) \n\t" + "lw %[temp2], 8(%[src]) \n\t" + "lw %[temp3], 12(%[src]) \n\t" + "lw %[temp4], 16(%[src]) \n\t" + "lw %[temp5], 20(%[src]) \n\t" + "lw %[temp6], 24(%[src]) \n\t" + "lw %[temp7], 28(%[src]) \n\t" + PTR_ADDIU "%[src], %[src], 32 \n\t" + "sw %[temp0], 0(%[dst]) \n\t" + "sw %[temp1], 4(%[dst]) \n\t" + "sw %[temp2], 8(%[dst]) \n\t" + "sw %[temp3], 12(%[dst]) \n\t" + "sw %[temp4], 16(%[dst]) \n\t" + "sw %[temp5], 20(%[dst]) \n\t" + "sw %[temp6], 24(%[dst]) \n\t" + "sw %[temp7], 28(%[dst]) \n\t" + "sw $0, 2304(%[dst]) \n\t" + "sw $0, 2308(%[dst]) \n\t" + "sw $0, 2312(%[dst]) \n\t" + "sw $0, 2316(%[dst]) \n\t" + "sw $0, 2320(%[dst]) \n\t" + "sw $0, 2324(%[dst]) \n\t" + "sw $0, 2328(%[dst]) \n\t" + "sw $0, 2332(%[dst]) \n\t" + "bne %[src], %[loop_end], 1b \n\t" + PTR_ADDIU "%[dst], %[dst], 32 \n\t" + ".set pop \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), + [src]"+r"(buff0), [dst]"+r"(buff1) + : [loop_end]"r"(loop_end) + : "memory" + ); + ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960, &swindow[64], 64); + fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64); + } else { // LONG_STOP or ONLY_LONG + ac->fdsp->vector_fmul_reverse(saved_ltp, ac->buf_mdct + 512, &lwindow[512], 512); + fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 512, lwindow, 512); + } + + float_copy(sce->ltp_state, sce->ltp_state + 1024, 1024); + float_copy(sce->ltp_state + 1024, sce->ret, 1024); + float_copy(sce->ltp_state + 2048, saved_ltp, 1024); +} +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +void ff_aacdec_init_mips(AACContext *c) +{ +#if HAVE_INLINE_ASM + c->imdct_and_windowing = imdct_and_windowing_mips; + c->apply_ltp = apply_ltp_mips; +#if HAVE_MIPSFPU + c->update_ltp = update_ltp_mips; +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/aacdec_mips.h b/libavcodec/mips/aacdec_mips.h new file mode 100644 index 0000000000..758266fc16 --- /dev/null +++ b/libavcodec/mips/aacdec_mips.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Darko Laus (darko@mips.com) + * Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * AAC Spectral Band Replication decoding functions optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacdec.c + */ + +#ifndef AVCODEC_MIPS_AACDEC_MIPS_H +#define AVCODEC_MIPS_AACDEC_MIPS_H + +#include "libavcodec/aac.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM && HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static inline float *VMUL2_mips(float *dst, const float *v, unsigned idx, + const float *scale) +{ + float temp0, temp1, temp2; + int temp3, temp4; + float *ret; + + __asm__ volatile( + "andi %[temp3], %[idx], 0x0F \n\t" + "andi %[temp4], %[idx], 0xF0 \n\t" + "sll %[temp3], %[temp3], 2 \n\t" + "srl %[temp4], %[temp4], 2 \n\t" + "lwc1 %[temp2], 0(%[scale]) \n\t" + "lwxc1 %[temp0], %[temp3](%[v]) \n\t" + "lwxc1 %[temp1], %[temp4](%[v]) \n\t" + "mul.s %[temp0], %[temp0], %[temp2] \n\t" + "mul.s %[temp1], %[temp1], %[temp2] \n\t" + PTR_ADDIU "%[ret], %[dst], 8 \n\t" + "swc1 %[temp0], 0(%[dst]) \n\t" + "swc1 %[temp1], 4(%[dst]) \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), + [temp2]"=&f"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [ret]"=&r"(ret) + : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v), + [dst]"r"(dst) + : "memory" + ); + return ret; +} + +static inline float *VMUL4_mips(float *dst, const float *v, unsigned idx, + const float *scale) +{ + int temp0, temp1, temp2, temp3; + float temp4, temp5, temp6, temp7, temp8; + float *ret; + + __asm__ volatile( + "andi %[temp0], %[idx], 0x03 \n\t" + "andi %[temp1], %[idx], 0x0C \n\t" + "andi %[temp2], %[idx], 0x30 \n\t" + "andi %[temp3], %[idx], 0xC0 \n\t" + "sll %[temp0], %[temp0], 2 \n\t" + "srl %[temp2], %[temp2], 2 \n\t" + "srl %[temp3], %[temp3], 4 \n\t" + "lwc1 %[temp4], 0(%[scale]) \n\t" + "lwxc1 %[temp5], %[temp0](%[v]) \n\t" + "lwxc1 %[temp6], %[temp1](%[v]) \n\t" + "lwxc1 %[temp7], %[temp2](%[v]) \n\t" + "lwxc1 %[temp8], %[temp3](%[v]) \n\t" + "mul.s %[temp5], %[temp5], %[temp4] \n\t" + "mul.s %[temp6], %[temp6], %[temp4] \n\t" + "mul.s %[temp7], %[temp7], %[temp4] \n\t" + "mul.s %[temp8], %[temp8], %[temp4] \n\t" + PTR_ADDIU "%[ret], %[dst], 16 \n\t" + "swc1 %[temp5], 0(%[dst]) \n\t" + "swc1 %[temp6], 4(%[dst]) \n\t" + "swc1 %[temp7], 8(%[dst]) \n\t" + "swc1 %[temp8], 12(%[dst]) \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), + [temp8]"=&f"(temp8), [ret]"=&r"(ret) + : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v), + [dst]"r"(dst) + : "memory" + ); + return ret; +} + +static inline float *VMUL2S_mips(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + int temp0, temp1, temp2, temp3, temp4, temp5; + float temp6, temp7, temp8, temp9; + float *ret; + + __asm__ volatile( + "andi %[temp0], %[idx], 0x0F \n\t" + "andi %[temp1], %[idx], 0xF0 \n\t" + "lw %[temp4], 0(%[scale]) \n\t" + "srl %[temp2], %[sign], 1 \n\t" + "sll %[temp3], %[sign], 31 \n\t" + "sll %[temp2], %[temp2], 31 \n\t" + "sll %[temp0], %[temp0], 2 \n\t" + "srl %[temp1], %[temp1], 2 \n\t" + "lwxc1 %[temp8], %[temp0](%[v]) \n\t" + "lwxc1 %[temp9], %[temp1](%[v]) \n\t" + "xor %[temp5], %[temp4], %[temp2] \n\t" + "xor %[temp4], %[temp4], %[temp3] \n\t" + "mtc1 %[temp5], %[temp6] \n\t" + "mtc1 %[temp4], %[temp7] \n\t" + "mul.s %[temp8], %[temp8], %[temp6] \n\t" + "mul.s %[temp9], %[temp9], %[temp7] \n\t" + PTR_ADDIU 
"%[ret], %[dst], 8 \n\t" + "swc1 %[temp8], 0(%[dst]) \n\t" + "swc1 %[temp9], 4(%[dst]) \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), + [temp8]"=&f"(temp8), [temp9]"=&f"(temp9), + [ret]"=&r"(ret) + : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v), + [dst]"r"(dst), [sign]"r"(sign) + : "memory" + ); + return ret; +} + +static inline float *VMUL4S_mips(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + int temp0, temp1, temp2, temp3, temp4; + float temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; + float *ret; + unsigned int mask = 1U << 31; + + __asm__ volatile( + "lw %[temp0], 0(%[scale]) \n\t" + "andi %[temp1], %[idx], 0x03 \n\t" + "andi %[temp2], %[idx], 0x0C \n\t" + "andi %[temp3], %[idx], 0x30 \n\t" + "andi %[temp4], %[idx], 0xC0 \n\t" + "sll %[temp1], %[temp1], 2 \n\t" + "srl %[temp3], %[temp3], 2 \n\t" + "srl %[temp4], %[temp4], 4 \n\t" + "lwxc1 %[temp10], %[temp1](%[v]) \n\t" + "lwxc1 %[temp11], %[temp2](%[v]) \n\t" + "lwxc1 %[temp12], %[temp3](%[v]) \n\t" + "lwxc1 %[temp13], %[temp4](%[v]) \n\t" + "and %[temp1], %[sign], %[mask] \n\t" + "srl %[temp2], %[idx], 12 \n\t" + "srl %[temp3], %[idx], 13 \n\t" + "srl %[temp4], %[idx], 14 \n\t" + "andi %[temp2], %[temp2], 1 \n\t" + "andi %[temp3], %[temp3], 1 \n\t" + "andi %[temp4], %[temp4], 1 \n\t" + "sllv %[sign], %[sign], %[temp2] \n\t" + "xor %[temp1], %[temp0], %[temp1] \n\t" + "and %[temp2], %[sign], %[mask] \n\t" + "mtc1 %[temp1], %[temp14] \n\t" + "xor %[temp2], %[temp0], %[temp2] \n\t" + "sllv %[sign], %[sign], %[temp3] \n\t" + "mtc1 %[temp2], %[temp15] \n\t" + "and %[temp3], %[sign], %[mask] \n\t" + "sllv %[sign], %[sign], %[temp4] \n\t" + "xor %[temp3], %[temp0], %[temp3] \n\t" + "and %[temp4], %[sign], %[mask] \n\t" + "mtc1 %[temp3], %[temp16] \n\t" + "xor %[temp4], %[temp0], %[temp4] \n\t" + "mtc1 %[temp4], %[temp17] \n\t" + "mul.s %[temp10], %[temp10], %[temp14] \n\t" + "mul.s %[temp11], %[temp11], %[temp15] \n\t" + "mul.s %[temp12], %[temp12], %[temp16] \n\t" + "mul.s %[temp13], %[temp13], %[temp17] \n\t" + PTR_ADDIU "%[ret], %[dst], 16 \n\t" + "swc1 %[temp10], 0(%[dst]) \n\t" + "swc1 %[temp11], 4(%[dst]) \n\t" + "swc1 %[temp12], 8(%[dst]) \n\t" + "swc1 %[temp13], 12(%[dst]) \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp10]"=&f"(temp10), + [temp11]"=&f"(temp11), [temp12]"=&f"(temp12), + [temp13]"=&f"(temp13), [temp14]"=&f"(temp14), + [temp15]"=&f"(temp15), [temp16]"=&f"(temp16), + [temp17]"=&f"(temp17), [ret]"=&r"(ret), + [sign]"+r"(sign) + : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v), + [dst]"r"(dst), [mask]"r"(mask) + : "memory" + ); + return ret; +} + +#define VMUL2 VMUL2_mips +#define VMUL4 VMUL4_mips +#define VMUL2S VMUL2S_mips +#define VMUL4S VMUL4S_mips +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */ + +#endif /* AVCODEC_MIPS_AACDEC_MIPS_H */ diff --git a/libavcodec/mips/aacpsdsp_mips.c b/libavcodec/mips/aacpsdsp_mips.c new file mode 100644 index 0000000000..83fdc2f9db --- /dev/null +++ b/libavcodec/mips/aacpsdsp_mips.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Darko Laus (darko@mips.com) + * Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacpsdsp.c + */ + +#include "config.h" +#include "libavcodec/aacpsdsp.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +static void ps_hybrid_analysis_ileave_mips(float (*out)[32][2], float L[2][38][64], + int i, int len) +{ + int temp0, temp1, temp2, temp3; + int temp4, temp5, temp6, temp7; + float *out1=&out[i][0][0]; + float *L1=&L[0][0][i]; + float *j=out1+ len*2; + + for (; i < 64; i++) { + + /* loop unrolled 8 times */ + __asm__ volatile ( + "1: \n\t" + "lw %[temp0], 0(%[L1]) \n\t" + "lw %[temp1], 9728(%[L1]) \n\t" + "lw %[temp2], 256(%[L1]) \n\t" + "lw %[temp3], 9984(%[L1]) \n\t" + "lw %[temp4], 512(%[L1]) \n\t" + "lw %[temp5], 10240(%[L1]) \n\t" + "lw %[temp6], 768(%[L1]) \n\t" + "lw %[temp7], 10496(%[L1]) \n\t" + "sw %[temp0], 0(%[out1]) \n\t" + "sw %[temp1], 4(%[out1]) \n\t" + "sw %[temp2], 8(%[out1]) \n\t" + "sw %[temp3], 12(%[out1]) \n\t" + "sw %[temp4], 16(%[out1]) \n\t" + "sw %[temp5], 20(%[out1]) \n\t" + "sw %[temp6], 24(%[out1]) \n\t" + "sw %[temp7], 28(%[out1]) \n\t" + PTR_ADDIU "%[out1], %[out1], 32 \n\t" + PTR_ADDIU "%[L1], %[L1], 1024 \n\t" + "bne %[out1], %[j], 1b \n\t" + + : [out1]"+r"(out1), [L1]"+r"(L1), [j]"+r"(j), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) + : [len]"r"(len) + : "memory" + ); + out1-=(len<<1)-64; + L1-=(len<<6)-1; + j+=len*2; + } +} + +static void ps_hybrid_synthesis_deint_mips(float out[2][38][64], + float (*in)[32][2], + int i, int len) +{ + int n; + int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + float *out1 = (float*)out + i; + float *out2 = (float*)out + 2432 + i; + float *in1 = (float*)in + 64 * i; + float *in2 = (float*)in + 64 * i + 1; + + for (; i < 64; i++) { + for (n = 0; n < 7; n++) { + + /* loop unrolled 8 times */ + __asm__ volatile ( + "lw %[temp0], 0(%[in1]) \n\t" + "lw %[temp1], 0(%[in2]) \n\t" + "lw %[temp2], 8(%[in1]) \n\t" + "lw %[temp3], 8(%[in2]) \n\t" + "lw %[temp4], 16(%[in1]) \n\t" + "lw %[temp5], 16(%[in2]) \n\t" + "lw %[temp6], 24(%[in1]) \n\t" + "lw %[temp7], 24(%[in2]) \n\t" + PTR_ADDIU "%[out1], %[out1], 1024 \n\t" + PTR_ADDIU "%[out2], %[out2], 1024 \n\t" + PTR_ADDIU "%[in1], %[in1], 32 \n\t" + PTR_ADDIU "%[in2], %[in2], 32 \n\t" + "sw %[temp0], -1024(%[out1]) \n\t" + "sw %[temp1], -1024(%[out2]) \n\t" + "sw %[temp2], -768(%[out1]) \n\t" + "sw %[temp3], -768(%[out2]) \n\t" + "sw %[temp4], -512(%[out1]) \n\t" + "sw %[temp5], -512(%[out2]) \n\t" + "sw %[temp6], -256(%[out1]) \n\t" + "sw %[temp7], -256(%[out2]) \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), + [out1]"+r"(out1), [out2]"+r"(out2), + [in1]"+r"(in1), [in2]"+r"(in2) + : + : "memory" + ); + } + /* loop unrolled 8 times */ + __asm__ volatile ( + "lw %[temp0], 0(%[in1]) \n\t" + "lw %[temp1], 0(%[in2]) \n\t" + "lw %[temp2], 8(%[in1]) \n\t" + "lw %[temp3], 8(%[in2]) \n\t" + "lw %[temp4], 16(%[in1]) \n\t" + "lw %[temp5], 16(%[in2]) \n\t" + "lw %[temp6], 24(%[in1]) \n\t" + "lw %[temp7], 24(%[in2]) \n\t" + PTR_ADDIU "%[out1], %[out1], -7164 \n\t" + PTR_ADDIU "%[out2], %[out2], -7164 \n\t" + PTR_ADDIU "%[in1], 
%[in1], 32 \n\t" + PTR_ADDIU "%[in2], %[in2], 32 \n\t" + "sw %[temp0], 7164(%[out1]) \n\t" + "sw %[temp1], 7164(%[out2]) \n\t" + "sw %[temp2], 7420(%[out1]) \n\t" + "sw %[temp3], 7420(%[out2]) \n\t" + "sw %[temp4], 7676(%[out1]) \n\t" + "sw %[temp5], 7676(%[out2]) \n\t" + "sw %[temp6], 7932(%[out1]) \n\t" + "sw %[temp7], 7932(%[out2]) \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), + [out1]"+r"(out1), [out2]"+r"(out2), + [in1]"+r"(in1), [in2]"+r"(in2) + : + : "memory" + ); + } +} + +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void ps_add_squares_mips(float *dst, const float (*src)[2], int n) +{ + int i; + float temp0, temp1, temp2, temp3, temp4, temp5; + float temp6, temp7, temp8, temp9, temp10, temp11; + float *src0 = (float*)&src[0][0]; + float *dst0 = &dst[0]; + + for (i = 0; i < 8; i++) { + /* loop unrolled 4 times */ + __asm__ volatile ( + "lwc1 %[temp0], 0(%[src0]) \n\t" + "lwc1 %[temp1], 4(%[src0]) \n\t" + "lwc1 %[temp2], 8(%[src0]) \n\t" + "lwc1 %[temp3], 12(%[src0]) \n\t" + "lwc1 %[temp4], 16(%[src0]) \n\t" + "lwc1 %[temp5], 20(%[src0]) \n\t" + "lwc1 %[temp6], 24(%[src0]) \n\t" + "lwc1 %[temp7], 28(%[src0]) \n\t" + "lwc1 %[temp8], 0(%[dst0]) \n\t" + "lwc1 %[temp9], 4(%[dst0]) \n\t" + "lwc1 %[temp10], 8(%[dst0]) \n\t" + "lwc1 %[temp11], 12(%[dst0]) \n\t" + "mul.s %[temp1], %[temp1], %[temp1] \n\t" + "mul.s %[temp3], %[temp3], %[temp3] \n\t" + "mul.s %[temp5], %[temp5], %[temp5] \n\t" + "mul.s %[temp7], %[temp7], %[temp7] \n\t" + "madd.s %[temp0], %[temp1], %[temp0], %[temp0] \n\t" + "madd.s %[temp2], %[temp3], %[temp2], %[temp2] \n\t" + "madd.s %[temp4], %[temp5], %[temp4], %[temp4] \n\t" + "madd.s %[temp6], %[temp7], %[temp6], %[temp6] \n\t" + "add.s %[temp0], %[temp8], %[temp0] \n\t" + "add.s %[temp2], %[temp9], %[temp2] \n\t" + "add.s %[temp4], %[temp10], %[temp4] \n\t" + "add.s %[temp6], %[temp11], %[temp6] \n\t" + "swc1 %[temp0], 0(%[dst0]) \n\t" + "swc1 %[temp2], 4(%[dst0]) \n\t" + "swc1 %[temp4], 8(%[dst0]) \n\t" + "swc1 %[temp6], 12(%[dst0]) \n\t" + PTR_ADDIU "%[dst0], %[dst0], 16 \n\t" + PTR_ADDIU "%[src0], %[src0], 32 \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [dst0]"+r"(dst0), [src0]"+r"(src0), + [temp10]"=&f"(temp10), [temp11]"=&f"(temp11) + : + : "memory" + ); + } +} + +static void ps_mul_pair_single_mips(float (*dst)[2], float (*src0)[2], float *src1, + int n) +{ + float temp0, temp1, temp2; + float *p_d, *p_s0, *p_s1, *end; + p_d = &dst[0][0]; + p_s0 = &src0[0][0]; + p_s1 = &src1[0]; + end = p_s1 + n; + + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + "1: \n\t" + "lwc1 %[temp2], 0(%[p_s1]) \n\t" + "lwc1 %[temp0], 0(%[p_s0]) \n\t" + "lwc1 %[temp1], 4(%[p_s0]) \n\t" + PTR_ADDIU "%[p_d], %[p_d], 8 \n\t" + "mul.s %[temp0], %[temp0], %[temp2] \n\t" + "mul.s %[temp1], %[temp1], %[temp2] \n\t" + PTR_ADDIU "%[p_s0], %[p_s0], 8 \n\t" + "swc1 %[temp0], -8(%[p_d]) \n\t" + "swc1 %[temp1], -4(%[p_d]) \n\t" + "bne %[p_s1], %[end], 1b \n\t" + PTR_ADDIU "%[p_s1], %[p_s1], 4 \n\t" + ".set pop \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), + [temp2]"=&f"(temp2), [p_d]"+r"(p_d), + [p_s0]"+r"(p_s0), [p_s1]"+r"(p_s1) + : [end]"r"(end) + : "memory" + ); +} + +static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2], + 
float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2], + const float phi_fract[2], const float (*Q_fract)[2], + const float *transient_gain, + float g_decay_slope, + int len) +{ + float *p_delay = &delay[0][0]; + float *p_out = &out[0][0]; + float *p_ap_delay = &ap_delay[0][0][0]; + const float *p_t_gain = transient_gain; + const float *p_Q_fract = &Q_fract[0][0]; + float ag0, ag1, ag2; + float phi_fract0 = phi_fract[0]; + float phi_fract1 = phi_fract[1]; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; + + float *p_delay_end = (p_delay + (len << 1)); + + /* merged 2 loops */ + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + "li.s %[ag0], 0.65143905753106 \n\t" + "li.s %[ag1], 0.56471812200776 \n\t" + "li.s %[ag2], 0.48954165955695 \n\t" + "mul.s %[ag0], %[ag0], %[g_decay_slope] \n\t" + "mul.s %[ag1], %[ag1], %[g_decay_slope] \n\t" + "mul.s %[ag2], %[ag2], %[g_decay_slope] \n\t" + "1: \n\t" + "lwc1 %[temp0], 0(%[p_delay]) \n\t" + "lwc1 %[temp1], 4(%[p_delay]) \n\t" + "lwc1 %[temp4], 16(%[p_ap_delay]) \n\t" + "lwc1 %[temp5], 20(%[p_ap_delay]) \n\t" + "mul.s %[temp3], %[temp0], %[phi_fract1] \n\t" + "lwc1 %[temp6], 0(%[p_Q_fract]) \n\t" + "mul.s %[temp2], %[temp1], %[phi_fract1] \n\t" + "lwc1 %[temp7], 4(%[p_Q_fract]) \n\t" + "madd.s %[temp3], %[temp3], %[temp1], %[phi_fract0] \n\t" + "msub.s %[temp2], %[temp2], %[temp0], %[phi_fract0] \n\t" + "mul.s %[temp8], %[temp5], %[temp7] \n\t" + "mul.s %[temp9], %[temp4], %[temp7] \n\t" + "lwc1 %[temp7], 12(%[p_Q_fract]) \n\t" + "mul.s %[temp0], %[ag0], %[temp2] \n\t" + "mul.s %[temp1], %[ag0], %[temp3] \n\t" + "msub.s %[temp8], %[temp8], %[temp4], %[temp6] \n\t" + "lwc1 %[temp4], 304(%[p_ap_delay]) \n\t" + "madd.s %[temp9], %[temp9], %[temp5], %[temp6] \n\t" + "lwc1 %[temp5], 308(%[p_ap_delay]) \n\t" + "sub.s %[temp0], %[temp8], %[temp0] \n\t" + "sub.s %[temp1], %[temp9], %[temp1] \n\t" + "madd.s %[temp2], %[temp2], %[ag0], %[temp0] \n\t" + "lwc1 %[temp6], 8(%[p_Q_fract]) \n\t" + "madd.s %[temp3], %[temp3], %[ag0], %[temp1] \n\t" + "mul.s %[temp8], %[temp5], %[temp7] \n\t" + "mul.s %[temp9], %[temp4], %[temp7] \n\t" + "lwc1 %[temp7], 20(%[p_Q_fract]) \n\t" + "msub.s %[temp8], %[temp8], %[temp4], %[temp6] \n\t" + "swc1 %[temp2], 40(%[p_ap_delay]) \n\t" + "mul.s %[temp2], %[ag1], %[temp0] \n\t" + "swc1 %[temp3], 44(%[p_ap_delay]) \n\t" + "mul.s %[temp3], %[ag1], %[temp1] \n\t" + "lwc1 %[temp4], 592(%[p_ap_delay]) \n\t" + "madd.s %[temp9], %[temp9], %[temp5], %[temp6] \n\t" + "lwc1 %[temp5], 596(%[p_ap_delay]) \n\t" + "sub.s %[temp2], %[temp8], %[temp2] \n\t" + "sub.s %[temp3], %[temp9], %[temp3] \n\t" + "lwc1 %[temp6], 16(%[p_Q_fract]) \n\t" + "madd.s %[temp0], %[temp0], %[ag1], %[temp2] \n\t" + "madd.s %[temp1], %[temp1], %[ag1], %[temp3] \n\t" + "mul.s %[temp8], %[temp5], %[temp7] \n\t" + "mul.s %[temp9], %[temp4], %[temp7] \n\t" + "msub.s %[temp8], %[temp8], %[temp4], %[temp6] \n\t" + "madd.s %[temp9], %[temp9], %[temp5], %[temp6] \n\t" + "swc1 %[temp0], 336(%[p_ap_delay]) \n\t" + "mul.s %[temp0], %[ag2], %[temp2] \n\t" + "swc1 %[temp1], 340(%[p_ap_delay]) \n\t" + "mul.s %[temp1], %[ag2], %[temp3] \n\t" + "lwc1 %[temp4], 0(%[p_t_gain]) \n\t" + "sub.s %[temp0], %[temp8], %[temp0] \n\t" + PTR_ADDIU "%[p_ap_delay], %[p_ap_delay], 8 \n\t" + "sub.s %[temp1], %[temp9], %[temp1] \n\t" + PTR_ADDIU "%[p_t_gain], %[p_t_gain], 4 \n\t" + "madd.s %[temp2], %[temp2], %[ag2], %[temp0] \n\t" + PTR_ADDIU "%[p_delay], %[p_delay], 8 \n\t" + "madd.s %[temp3], %[temp3], %[ag2], %[temp1] \n\t" + PTR_ADDIU "%[p_out], 
%[p_out], 8 \n\t" + "mul.s %[temp5], %[temp4], %[temp0] \n\t" + "mul.s %[temp6], %[temp4], %[temp1] \n\t" + "swc1 %[temp2], 624(%[p_ap_delay]) \n\t" + "swc1 %[temp3], 628(%[p_ap_delay]) \n\t" + "swc1 %[temp5], -8(%[p_out]) \n\t" + "swc1 %[temp6], -4(%[p_out]) \n\t" + "bne %[p_delay], %[p_delay_end],1b \n\t" + " swc1 %[temp6], -4(%[p_out]) \n\t" + ".set pop \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay), + [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out), + [ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2) + : [phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1), + [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope) + : "memory" + ); +} + +static void ps_stereo_interpolate_mips(float (*l)[2], float (*r)[2], + float h[2][4], float h_step[2][4], + int len) +{ + float h0 = h[0][0]; + float h1 = h[0][1]; + float h2 = h[0][2]; + float h3 = h[0][3]; + float hs0 = h_step[0][0]; + float hs1 = h_step[0][1]; + float hs2 = h_step[0][2]; + float hs3 = h_step[0][3]; + float temp0, temp1, temp2, temp3; + float l_re, l_im, r_re, r_im; + + float *l_end = ((float *)l + (len << 1)); + + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + "1: \n\t" + "add.s %[h0], %[h0], %[hs0] \n\t" + "lwc1 %[l_re], 0(%[l]) \n\t" + "add.s %[h1], %[h1], %[hs1] \n\t" + "lwc1 %[r_re], 0(%[r]) \n\t" + "add.s %[h2], %[h2], %[hs2] \n\t" + "lwc1 %[l_im], 4(%[l]) \n\t" + "add.s %[h3], %[h3], %[hs3] \n\t" + "lwc1 %[r_im], 4(%[r]) \n\t" + "mul.s %[temp0], %[h0], %[l_re] \n\t" + PTR_ADDIU "%[l], %[l], 8 \n\t" + "mul.s %[temp2], %[h1], %[l_re] \n\t" + PTR_ADDIU "%[r], %[r], 8 \n\t" + "madd.s %[temp0], %[temp0], %[h2], %[r_re] \n\t" + "madd.s %[temp2], %[temp2], %[h3], %[r_re] \n\t" + "mul.s %[temp1], %[h0], %[l_im] \n\t" + "mul.s %[temp3], %[h1], %[l_im] \n\t" + "madd.s %[temp1], %[temp1], %[h2], %[r_im] \n\t" + "madd.s %[temp3], %[temp3], %[h3], %[r_im] \n\t" + "swc1 %[temp0], -8(%[l]) \n\t" + "swc1 %[temp2], -8(%[r]) \n\t" + "swc1 %[temp1], -4(%[l]) \n\t" + "bne %[l], %[l_end], 1b \n\t" + " swc1 %[temp3], -4(%[r]) \n\t" + ".set pop \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), + [temp2]"=&f"(temp2), [temp3]"=&f"(temp3), + [h0]"+f"(h0), [h1]"+f"(h1), [h2]"+f"(h2), + [h3]"+f"(h3), [l]"+r"(l), [r]"+r"(r), + [l_re]"=&f"(l_re), [l_im]"=&f"(l_im), + [r_re]"=&f"(r_re), [r_im]"=&f"(r_im) + : [hs0]"f"(hs0), [hs1]"f"(hs1), [hs2]"f"(hs2), + [hs3]"f"(hs3), [l_end]"r"(l_end) + : "memory" + ); +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +void ff_psdsp_init_mips(PSDSPContext *s) +{ +#if HAVE_INLINE_ASM + s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_mips; + s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_mips; +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + s->add_squares = ps_add_squares_mips; + s->mul_pair_single = ps_mul_pair_single_mips; + s->decorrelate = ps_decorrelate_mips; + s->stereo_interpolate[0] = ps_stereo_interpolate_mips; +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/aacpsy_mips.h b/libavcodec/mips/aacpsy_mips.h new file mode 100644 index 0000000000..a1fe5ccea9 --- /dev/null +++ b/libavcodec/mips/aacpsy_mips.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2012 + * MIPS 
Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * AAC encoder psychoacoustic model routines optimized + * for MIPS floating-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacpsy.c + */ + +#ifndef AVCODEC_MIPS_AACPSY_MIPS_H +#define AVCODEC_MIPS_AACPSY_MIPS_H + +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM && HAVE_MIPSFPU && ( PSY_LAME_FIR_LEN == 21 ) +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void calc_thr_3gpp_mips(const FFPsyWindowInfo *wi, const int num_bands, + AacPsyChannel *pch, const uint8_t *band_sizes, + const float *coefs, const int cutoff) +{ + int i, w, g; + int start = 0, wstart = 0; + for (w = 0; w < wi->num_windows*16; w += 16) { + wstart = 0; + for (g = 0; g < num_bands; g++) { + AacPsyBand *band = &pch->band[w+g]; + + float form_factor = 0.0f; + float Temp; + band->energy = 0.0f; + if (wstart < cutoff) { + for (i = 0; i < band_sizes[g]; i+=4) { + float a, b, c, d; + float ax, bx, cx, dx; + float *cf = (float *)&coefs[start+i]; + + __asm__ volatile ( + "lwc1 %[a], 0(%[cf]) \n\t" + "lwc1 %[b], 4(%[cf]) \n\t" + "lwc1 %[c], 8(%[cf]) \n\t" + "lwc1 %[d], 12(%[cf]) \n\t" + "abs.s %[a], %[a] \n\t" + "abs.s %[b], %[b] \n\t" + "abs.s %[c], %[c] \n\t" + "abs.s %[d], %[d] \n\t" + "sqrt.s %[ax], %[a] \n\t" + "sqrt.s %[bx], %[b] \n\t" + "sqrt.s %[cx], %[c] \n\t" + "sqrt.s %[dx], %[d] \n\t" + "madd.s %[e], %[e], %[a], %[a] \n\t" + "madd.s %[e], %[e], %[b], %[b] \n\t" + "madd.s %[e], %[e], %[c], %[c] \n\t" + "madd.s %[e], %[e], %[d], %[d] \n\t" + "add.s %[f], %[f], %[ax] \n\t" + "add.s %[f], %[f], %[bx] \n\t" + "add.s %[f], %[f], %[cx] \n\t" + "add.s %[f], %[f], %[dx] \n\t" + + : [a]"=&f"(a), [b]"=&f"(b), + [c]"=&f"(c), [d]"=&f"(d), + [e]"+f"(band->energy), [f]"+f"(form_factor), + [ax]"=&f"(ax), [bx]"=&f"(bx), + [cx]"=&f"(cx), [dx]"=&f"(dx) + : [cf]"r"(cf) + : "memory" + ); + } + } + + Temp = sqrtf((float)band_sizes[g] / band->energy); + band->thr = band->energy * 0.001258925f; + band->nz_lines = form_factor * sqrtf(Temp); + start += band_sizes[g]; + wstart += band_sizes[g]; + } + } +} + +static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float * psy_fir_coeffs) +{ + float sum1, sum2, sum3, sum4; + float *fb = (float*)firbuf; + float *fb_end = fb + AAC_BLOCK_SIZE_LONG; + float *hp = hpfsmpl; + + float coeff0 = psy_fir_coeffs[1]; + float coeff1 = psy_fir_coeffs[3]; + float coeff2 = psy_fir_coeffs[5]; + float coeff3 = psy_fir_coeffs[7]; + float coeff4 = psy_fir_coeffs[9]; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + + "li.s $f12, 32768 \n\t" + "1: \n\t" + "lwc1 $f0, 40(%[fb]) \n\t" + "lwc1 $f1, 4(%[fb]) \n\t" + "lwc1 $f2, 80(%[fb]) \n\t" + "lwc1 $f3, 44(%[fb]) \n\t" + "lwc1 $f4, 8(%[fb]) \n\t" + "madd.s %[sum1], $f0, $f1, %[coeff0] \n\t" + "lwc1 $f5, 84(%[fb]) \n\t" + "lwc1 $f6, 48(%[fb]) \n\t" + "madd.s %[sum2], $f3, $f4, %[coeff0] \n\t" + "lwc1 $f7, 12(%[fb]) \n\t" + "madd.s %[sum1], %[sum1], $f2, %[coeff0] \n\t" + "lwc1 $f8, 88(%[fb]) \n\t" + "lwc1 $f9, 52(%[fb]) \n\t" + "madd.s %[sum2], %[sum2], $f5, %[coeff0] \n\t" + "madd.s %[sum3], $f6, $f7, %[coeff0] \n\t" + "lwc1 $f10, 16(%[fb]) \n\t" + "lwc1 $f11, 92(%[fb]) \n\t" + "madd.s %[sum1], %[sum1], $f7, %[coeff1] \n\t" + "lwc1 $f1, 72(%[fb]) \n\t" + "madd.s %[sum3], %[sum3], $f8, %[coeff0] \n\t" + "madd.s %[sum4], $f9, $f10, %[coeff0] \n\t" + "madd.s %[sum2], %[sum2], $f10, %[coeff1] \n\t" + "madd.s %[sum1], %[sum1], $f1, %[coeff1] \n\t" + "lwc1 $f4, 76(%[fb]) \n\t" + "lwc1 $f8, 
20(%[fb]) \n\t" + "madd.s %[sum4], %[sum4], $f11, %[coeff0] \n\t" + "lwc1 $f11, 24(%[fb]) \n\t" + "madd.s %[sum2], %[sum2], $f4, %[coeff1] \n\t" + "madd.s %[sum1], %[sum1], $f8, %[coeff2] \n\t" + "madd.s %[sum3], %[sum3], $f8, %[coeff1] \n\t" + "madd.s %[sum4], %[sum4], $f11, %[coeff1] \n\t" + "lwc1 $f7, 64(%[fb]) \n\t" + "madd.s %[sum2], %[sum2], $f11, %[coeff2] \n\t" + "lwc1 $f10, 68(%[fb]) \n\t" + "madd.s %[sum3], %[sum3], $f2, %[coeff1] \n\t" + "madd.s %[sum4], %[sum4], $f5, %[coeff1] \n\t" + "madd.s %[sum1], %[sum1], $f7, %[coeff2] \n\t" + "madd.s %[sum2], %[sum2], $f10, %[coeff2] \n\t" + "lwc1 $f2, 28(%[fb]) \n\t" + "lwc1 $f5, 32(%[fb]) \n\t" + "lwc1 $f8, 56(%[fb]) \n\t" + "lwc1 $f11, 60(%[fb]) \n\t" + "madd.s %[sum3], %[sum3], $f2, %[coeff2] \n\t" + "madd.s %[sum4], %[sum4], $f5, %[coeff2] \n\t" + "madd.s %[sum1], %[sum1], $f2, %[coeff3] \n\t" + "madd.s %[sum2], %[sum2], $f5, %[coeff3] \n\t" + "madd.s %[sum3], %[sum3], $f1, %[coeff2] \n\t" + "madd.s %[sum4], %[sum4], $f4, %[coeff2] \n\t" + "madd.s %[sum1], %[sum1], $f8, %[coeff3] \n\t" + "madd.s %[sum2], %[sum2], $f11, %[coeff3] \n\t" + "lwc1 $f1, 36(%[fb]) \n\t" + PTR_ADDIU "%[fb], %[fb], 16 \n\t" + "madd.s %[sum4], %[sum4], $f0, %[coeff3] \n\t" + "madd.s %[sum3], %[sum3], $f1, %[coeff3] \n\t" + "madd.s %[sum1], %[sum1], $f1, %[coeff4] \n\t" + "madd.s %[sum2], %[sum2], $f0, %[coeff4] \n\t" + "madd.s %[sum4], %[sum4], $f10, %[coeff3] \n\t" + "madd.s %[sum3], %[sum3], $f7, %[coeff3] \n\t" + "madd.s %[sum1], %[sum1], $f6, %[coeff4] \n\t" + "madd.s %[sum2], %[sum2], $f9, %[coeff4] \n\t" + "madd.s %[sum4], %[sum4], $f6, %[coeff4] \n\t" + "madd.s %[sum3], %[sum3], $f3, %[coeff4] \n\t" + "mul.s %[sum1], %[sum1], $f12 \n\t" + "mul.s %[sum2], %[sum2], $f12 \n\t" + "madd.s %[sum4], %[sum4], $f11, %[coeff4] \n\t" + "madd.s %[sum3], %[sum3], $f8, %[coeff4] \n\t" + "swc1 %[sum1], 0(%[hp]) \n\t" + "swc1 %[sum2], 4(%[hp]) \n\t" + "mul.s %[sum4], %[sum4], $f12 \n\t" + "mul.s %[sum3], %[sum3], $f12 \n\t" + "swc1 %[sum4], 12(%[hp]) \n\t" + "swc1 %[sum3], 8(%[hp]) \n\t" + "bne %[fb], %[fb_end], 1b \n\t" + PTR_ADDIU "%[hp], %[hp], 16 \n\t" + + ".set pop \n\t" + + : [sum1]"=&f"(sum1), [sum2]"=&f"(sum2), + [sum3]"=&f"(sum3), [sum4]"=&f"(sum4), + [fb]"+r"(fb), [hp]"+r"(hp) + : [coeff0]"f"(coeff0), [coeff1]"f"(coeff1), + [coeff2]"f"(coeff2), [coeff3]"f"(coeff3), + [coeff4]"f"(coeff4), [fb_end]"r"(fb_end) + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", + "$f7", "$f8", "$f9", "$f10", "$f11", "$f12", + "memory" + ); +} + +#define calc_thr_3gpp calc_thr_3gpp_mips +#define psy_hp_filter psy_hp_filter_mips + +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */ +#endif /* AVCODEC_MIPS_AACPSY_MIPS_H */ diff --git a/libavcodec/mips/aacsbr_mips.c b/libavcodec/mips/aacsbr_mips.c new file mode 100644 index 0000000000..56aa4e8682 --- /dev/null +++ b/libavcodec/mips/aacsbr_mips.c @@ -0,0 +1,623 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacsbr.c + */ + +#include "libavcodec/aac.h" +#include "libavcodec/aacsbr.h" +#include "libavutil/mips/asmdefs.h" + +#define ENVELOPE_ADJUSTMENT_OFFSET 2 + +#if HAVE_INLINE_ASM +static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr, + float X_low[32][40][2], const float W[2][32][32][2], + int buf_idx) +{ + int i, k; + int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + float *p_x_low = &X_low[0][8][0]; + float *p_w = (float*)&W[buf_idx][0][0][0]; + float *p_x1_low = &X_low[0][0][0]; + float *p_w1 = (float*)&W[1-buf_idx][24][0][0]; + + float *loop_end=p_x1_low + 2560; + + /* loop unrolled 8 times */ + __asm__ volatile ( + "1: \n\t" + "sw $0, 0(%[p_x1_low]) \n\t" + "sw $0, 4(%[p_x1_low]) \n\t" + "sw $0, 8(%[p_x1_low]) \n\t" + "sw $0, 12(%[p_x1_low]) \n\t" + "sw $0, 16(%[p_x1_low]) \n\t" + "sw $0, 20(%[p_x1_low]) \n\t" + "sw $0, 24(%[p_x1_low]) \n\t" + "sw $0, 28(%[p_x1_low]) \n\t" + PTR_ADDIU "%[p_x1_low],%[p_x1_low], 32 \n\t" + "bne %[p_x1_low], %[loop_end], 1b \n\t" + PTR_ADDIU "%[p_x1_low],%[p_x1_low], -10240 \n\t" + + : [p_x1_low]"+r"(p_x1_low) + : [loop_end]"r"(loop_end) + : "memory" + ); + + for (k = 0; k < sbr->kx[1]; k++) { + for (i = 0; i < 32; i+=4) { + /* loop unrolled 4 times */ + __asm__ volatile ( + "lw %[temp0], 0(%[p_w]) \n\t" + "lw %[temp1], 4(%[p_w]) \n\t" + "lw %[temp2], 256(%[p_w]) \n\t" + "lw %[temp3], 260(%[p_w]) \n\t" + "lw %[temp4], 512(%[p_w]) \n\t" + "lw %[temp5], 516(%[p_w]) \n\t" + "lw %[temp6], 768(%[p_w]) \n\t" + "lw %[temp7], 772(%[p_w]) \n\t" + "sw %[temp0], 0(%[p_x_low]) \n\t" + "sw %[temp1], 4(%[p_x_low]) \n\t" + "sw %[temp2], 8(%[p_x_low]) \n\t" + "sw %[temp3], 
12(%[p_x_low]) \n\t" + "sw %[temp4], 16(%[p_x_low]) \n\t" + "sw %[temp5], 20(%[p_x_low]) \n\t" + "sw %[temp6], 24(%[p_x_low]) \n\t" + "sw %[temp7], 28(%[p_x_low]) \n\t" + PTR_ADDIU "%[p_x_low], %[p_x_low], 32 \n\t" + PTR_ADDIU "%[p_w], %[p_w], 1024 \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), + [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low) + : + : "memory" + ); + } + p_x_low += 16; + p_w -= 2046; + } + + for (k = 0; k < sbr->kx[0]; k++) { + for (i = 0; i < 2; i++) { + + /* loop unrolled 4 times */ + __asm__ volatile ( + "lw %[temp0], 0(%[p_w1]) \n\t" + "lw %[temp1], 4(%[p_w1]) \n\t" + "lw %[temp2], 256(%[p_w1]) \n\t" + "lw %[temp3], 260(%[p_w1]) \n\t" + "lw %[temp4], 512(%[p_w1]) \n\t" + "lw %[temp5], 516(%[p_w1]) \n\t" + "lw %[temp6], 768(%[p_w1]) \n\t" + "lw %[temp7], 772(%[p_w1]) \n\t" + "sw %[temp0], 0(%[p_x1_low]) \n\t" + "sw %[temp1], 4(%[p_x1_low]) \n\t" + "sw %[temp2], 8(%[p_x1_low]) \n\t" + "sw %[temp3], 12(%[p_x1_low]) \n\t" + "sw %[temp4], 16(%[p_x1_low]) \n\t" + "sw %[temp5], 20(%[p_x1_low]) \n\t" + "sw %[temp6], 24(%[p_x1_low]) \n\t" + "sw %[temp7], 28(%[p_x1_low]) \n\t" + PTR_ADDIU "%[p_x1_low], %[p_x1_low], 32 \n\t" + PTR_ADDIU "%[p_w1], %[p_w1], 1024 \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), + [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low) + : + : "memory" + ); + } + p_x1_low += 64; + p_w1 -= 510; + } + return 0; +} + +static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64], + const float Y0[38][64][2], const float Y1[38][64][2], + const float X_low[32][40][2], int ch) +{ + int k, i; + const int i_f = 32; + int temp0, temp1, temp2, temp3; + const float *X_low1, *Y01, *Y11; + float *x1=&X[0][0][0]; + float *j=x1+4864; + const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0); + + /* loop unrolled 8 times */ + __asm__ volatile ( + "1: \n\t" + "sw $0, 0(%[x1]) \n\t" + "sw $0, 4(%[x1]) \n\t" + "sw $0, 8(%[x1]) \n\t" + "sw $0, 12(%[x1]) \n\t" + "sw $0, 16(%[x1]) \n\t" + "sw $0, 20(%[x1]) \n\t" + "sw $0, 24(%[x1]) \n\t" + "sw $0, 28(%[x1]) \n\t" + PTR_ADDIU "%[x1],%[x1], 32 \n\t" + "bne %[x1], %[j], 1b \n\t" + PTR_ADDIU "%[x1],%[x1], -19456 \n\t" + + : [x1]"+r"(x1) + : [j]"r"(j) + : "memory" + ); + + if (i_Temp != 0) { + + X_low1=&X_low[0][2][0]; + + for (k = 0; k < sbr->kx[0]; k++) { + + __asm__ volatile ( + "move %[i], $zero \n\t" + "2: \n\t" + "lw %[temp0], 0(%[X_low1]) \n\t" + "lw %[temp1], 4(%[X_low1]) \n\t" + "sw %[temp0], 0(%[x1]) \n\t" + "sw %[temp1], 9728(%[x1]) \n\t" + PTR_ADDIU "%[x1], %[x1], 256 \n\t" + PTR_ADDIU "%[X_low1], %[X_low1], 8 \n\t" + "addiu %[i], %[i], 1 \n\t" + "bne %[i], %[i_Temp], 2b \n\t" + + : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1) + : [i_Temp]"r"(i_Temp) + : "memory" + ); + x1-=(i_Temp<<6)-1; + X_low1-=(i_Temp<<1)-80; + } + + x1=&X[0][0][k]; + Y01=(float*)&Y0[32][k][0]; + + for (; k < sbr->kx[0] + sbr->m[0]; k++) { + __asm__ volatile ( + "move %[i], $zero \n\t" + "3: \n\t" + "lw %[temp0], 0(%[Y01]) \n\t" + "lw %[temp1], 4(%[Y01]) \n\t" + "sw %[temp0], 0(%[x1]) \n\t" + "sw %[temp1], 9728(%[x1]) \n\t" + PTR_ADDIU "%[x1], %[x1], 256 \n\t" + PTR_ADDIU "%[Y01], %[Y01], 512 \n\t" + "addiu %[i], %[i], 1 \n\t" + "bne %[i], %[i_Temp], 3b \n\t" + + : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i), + 
[temp0]"=&r"(temp0), [temp1]"=&r"(temp1) + : [i_Temp]"r"(i_Temp) + : "memory" + ); + x1 -=(i_Temp<<6)-1; + Y01 -=(i_Temp<<7)-2; + } + } + + x1=&X[0][i_Temp][0]; + X_low1=&X_low[0][i_Temp+2][0]; + temp3=38; + + for (k = 0; k < sbr->kx[1]; k++) { + + __asm__ volatile ( + "move %[i], %[i_Temp] \n\t" + "4: \n\t" + "lw %[temp0], 0(%[X_low1]) \n\t" + "lw %[temp1], 4(%[X_low1]) \n\t" + "sw %[temp0], 0(%[x1]) \n\t" + "sw %[temp1], 9728(%[x1]) \n\t" + PTR_ADDIU "%[x1], %[x1], 256 \n\t" + PTR_ADDIU "%[X_low1],%[X_low1], 8 \n\t" + "addiu %[i], %[i], 1 \n\t" + "bne %[i], %[temp3], 4b \n\t" + + : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2) + : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3) + : "memory" + ); + x1 -= ((38-i_Temp)<<6)-1; + X_low1 -= ((38-i_Temp)<<1)- 80; + } + + x1=&X[0][i_Temp][k]; + Y11=&Y1[i_Temp][k][0]; + temp2=32; + + for (; k < sbr->kx[1] + sbr->m[1]; k++) { + + __asm__ volatile ( + "move %[i], %[i_Temp] \n\t" + "5: \n\t" + "lw %[temp0], 0(%[Y11]) \n\t" + "lw %[temp1], 4(%[Y11]) \n\t" + "sw %[temp0], 0(%[x1]) \n\t" + "sw %[temp1], 9728(%[x1]) \n\t" + PTR_ADDIU "%[x1], %[x1], 256 \n\t" + PTR_ADDIU "%[Y11], %[Y11], 512 \n\t" + "addiu %[i], %[i], 1 \n\t" + "bne %[i], %[temp2], 5b \n\t" + + : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1) + : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3), + [temp2]"r"(temp2) + : "memory" + ); + + x1 -= ((32-i_Temp)<<6)-1; + Y11 -= ((32-i_Temp)<<7)-2; + } + return 0; +} + +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void sbr_hf_assemble_mips(float Y1[38][64][2], + const float X_high[64][40][2], + SpectralBandReplication *sbr, SBRData *ch_data, + const int e_a[2]) +{ + int e, i, j, m; + const int h_SL = 4 * !sbr->bs_smoothing_mode; + const int kx = sbr->kx[1]; + const int m_max = sbr->m[1]; + static const float h_smooth[5] = { + 0.33333333333333, + 0.30150283239582, + 0.21816949906249, + 0.11516383427084, + 0.03183050093751, + }; + + float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp; + int indexnoise = ch_data->f_indexnoise; + int indexsine = ch_data->f_indexsine; + float *g_temp1, *q_temp1, *pok, *pok1; + float temp1, temp2, temp3, temp4; + int size = m_max; + + if (sbr->reset) { + for (i = 0; i < h_SL; i++) { + memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0])); + memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0], m_max * sizeof(sbr->q_m[0][0])); + } + } else if (h_SL) { + memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0])); + memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0])); + } + + for (e = 0; e < ch_data->bs_num_env; e++) { + for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) { + g_temp1 = g_temp[h_SL + i]; + pok = sbr->gain[e]; + q_temp1 = q_temp[h_SL + i]; + pok1 = sbr->q_m[e]; + + /* loop unrolled 4 times */ + for (j=0; j<(size>>2); j++) { + __asm__ volatile ( + "lw %[temp1], 0(%[pok]) \n\t" + "lw %[temp2], 4(%[pok]) \n\t" + "lw %[temp3], 8(%[pok]) \n\t" + "lw %[temp4], 12(%[pok]) \n\t" + "sw %[temp1], 0(%[g_temp1]) \n\t" + "sw %[temp2], 4(%[g_temp1]) \n\t" + "sw %[temp3], 8(%[g_temp1]) \n\t" + "sw %[temp4], 12(%[g_temp1]) \n\t" + "lw %[temp1], 0(%[pok1]) \n\t" + "lw %[temp2], 4(%[pok1]) \n\t" + "lw %[temp3], 8(%[pok1]) \n\t" + "lw %[temp4], 12(%[pok1]) \n\t" + "sw %[temp1], 0(%[q_temp1]) \n\t" + "sw %[temp2], 4(%[q_temp1]) \n\t" + "sw %[temp3], 8(%[q_temp1]) \n\t" + "sw 
%[temp4], 12(%[q_temp1]) \n\t" + PTR_ADDIU "%[pok], %[pok], 16 \n\t" + PTR_ADDIU "%[g_temp1], %[g_temp1], 16 \n\t" + PTR_ADDIU "%[pok1], %[pok1], 16 \n\t" + PTR_ADDIU "%[q_temp1], %[q_temp1], 16 \n\t" + + : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), + [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), + [pok]"+r"(pok), [g_temp1]"+r"(g_temp1), + [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1) + : + : "memory" + ); + } + + for (j=0; j<(size&3); j++) { + __asm__ volatile ( + "lw %[temp1], 0(%[pok]) \n\t" + "lw %[temp2], 0(%[pok1]) \n\t" + "sw %[temp1], 0(%[g_temp1]) \n\t" + "sw %[temp2], 0(%[q_temp1]) \n\t" + PTR_ADDIU "%[pok], %[pok], 4 \n\t" + PTR_ADDIU "%[g_temp1], %[g_temp1], 4 \n\t" + PTR_ADDIU "%[pok1], %[pok1], 4 \n\t" + PTR_ADDIU "%[q_temp1], %[q_temp1], 4 \n\t" + + : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), + [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), + [pok]"+r"(pok), [g_temp1]"+r"(g_temp1), + [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1) + : + : "memory" + ); + } + } + } + + for (e = 0; e < ch_data->bs_num_env; e++) { + for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) { + LOCAL_ALIGNED_16(float, g_filt_tab, [48]); + LOCAL_ALIGNED_16(float, q_filt_tab, [48]); + float *g_filt, *q_filt; + + if (h_SL && e != e_a[0] && e != e_a[1]) { + g_filt = g_filt_tab; + q_filt = q_filt_tab; + + for (m = 0; m < m_max; m++) { + const int idx1 = i + h_SL; + g_filt[m] = 0.0f; + q_filt[m] = 0.0f; + + for (j = 0; j <= h_SL; j++) { + g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j]; + q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j]; + } + } + } else { + g_filt = g_temp[i + h_SL]; + q_filt = q_temp[i]; + } + + sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max, + i + ENVELOPE_ADJUSTMENT_OFFSET); + + if (e != e_a[0] && e != e_a[1]) { + sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e], + q_filt, indexnoise, + kx, m_max); + } else { + int idx = indexsine&1; + int A = (1-((indexsine+(kx & 1))&2)); + int B = (A^(-idx)) + idx; + float *out = &Y1[i][kx][idx]; + float *in = sbr->s_m[e]; + float temp0, temp1, temp2, temp3, temp4, temp5; + float A_f = (float)A; + float B_f = (float)B; + + for (m = 0; m+1 < m_max; m+=2) { + + temp2 = out[0]; + temp3 = out[2]; + + __asm__ volatile( + "lwc1 %[temp0], 0(%[in]) \n\t" + "lwc1 %[temp1], 4(%[in]) \n\t" + "madd.s %[temp4], %[temp2], %[temp0], %[A_f] \n\t" + "madd.s %[temp5], %[temp3], %[temp1], %[B_f] \n\t" + "swc1 %[temp4], 0(%[out]) \n\t" + "swc1 %[temp5], 8(%[out]) \n\t" + PTR_ADDIU "%[in], %[in], 8 \n\t" + PTR_ADDIU "%[out], %[out], 16 \n\t" + + : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1), + [temp4]"=&f" (temp4), [temp5]"=&f"(temp5), + [in]"+r"(in), [out]"+r"(out) + : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2), + [temp3]"f"(temp3) + : "memory" + ); + } + if(m_max&1) + out[2*m ] += in[m ] * A; + } + indexnoise = (indexnoise + m_max) & 0x1ff; + indexsine = (indexsine + 1) & 3; + } + } + ch_data->f_indexnoise = indexnoise; + ch_data->f_indexsine = indexsine; +} + +static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp, + float (*alpha0)[2], float (*alpha1)[2], + const float X_low[32][40][2], int k0) +{ + int k; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c; + float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im; + + c = 1.000001f; + + for (k = 0; k < k0; k++) { + LOCAL_ALIGNED_16(float, phi, [3], [2][2]); + float dk; + phi1 = &phi[0][0][0]; + alpha_1 = &alpha1[k][0]; + alpha_0 = &alpha0[k][0]; + dsp->autocorrelate(X_low[k], phi); + + __asm__ volatile ( + "lwc1 %[temp0], 40(%[phi1]) \n\t" + "lwc1 %[temp1], 16(%[phi1]) \n\t" + 
"lwc1 %[temp2], 24(%[phi1]) \n\t" + "lwc1 %[temp3], 28(%[phi1]) \n\t" + "mul.s %[dk], %[temp0], %[temp1] \n\t" + "lwc1 %[temp4], 0(%[phi1]) \n\t" + "mul.s %[res2], %[temp2], %[temp2] \n\t" + "lwc1 %[temp5], 4(%[phi1]) \n\t" + "madd.s %[res2], %[res2], %[temp3], %[temp3] \n\t" + "lwc1 %[temp6], 8(%[phi1]) \n\t" + "div.s %[res2], %[res2], %[c] \n\t" + "lwc1 %[temp0], 12(%[phi1]) \n\t" + "sub.s %[dk], %[dk], %[res2] \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk) + : [phi1]"r"(phi1), [c]"f"(c) + : "memory" + ); + + if (!dk) { + alpha_1[0] = 0; + alpha_1[1] = 0; + } else { + __asm__ volatile ( + "mul.s %[temp_real], %[temp4], %[temp2] \n\t" + "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3] \n\t" + "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1] \n\t" + "mul.s %[temp_im], %[temp4], %[temp3] \n\t" + "madd.s %[temp_im], %[temp_im], %[temp5], %[temp2] \n\t" + "nmsub.s %[temp_im], %[temp_im], %[temp0], %[temp1] \n\t" + "div.s %[temp_real], %[temp_real], %[dk] \n\t" + "div.s %[temp_im], %[temp_im], %[dk] \n\t" + "swc1 %[temp_real], 0(%[alpha_1]) \n\t" + "swc1 %[temp_im], 4(%[alpha_1]) \n\t" + + : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im) + : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1), + [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4), + [temp5]"f"(temp5), [temp6]"f"(temp6), + [alpha_1]"r"(alpha_1), [dk]"f"(dk) + : "memory" + ); + } + + if (!phi1[4]) { + alpha_0[0] = 0; + alpha_0[1] = 0; + } else { + __asm__ volatile ( + "lwc1 %[temp6], 0(%[alpha_1]) \n\t" + "lwc1 %[temp7], 4(%[alpha_1]) \n\t" + "mul.s %[temp_real], %[temp6], %[temp2] \n\t" + "add.s %[temp_real], %[temp_real], %[temp4] \n\t" + "madd.s %[temp_real], %[temp_real], %[temp7], %[temp3] \n\t" + "mul.s %[temp_im], %[temp7], %[temp2] \n\t" + "add.s %[temp_im], %[temp_im], %[temp5] \n\t" + "nmsub.s %[temp_im], %[temp_im], %[temp6], %[temp3] \n\t" + "div.s %[temp_real], %[temp_real], %[temp1] \n\t" + "div.s %[temp_im], %[temp_im], %[temp1] \n\t" + "neg.s %[temp_real], %[temp_real] \n\t" + "neg.s %[temp_im], %[temp_im] \n\t" + "swc1 %[temp_real], 0(%[alpha_0]) \n\t" + "swc1 %[temp_im], 4(%[alpha_0]) \n\t" + + : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), + [res1]"=&f"(res1), [res2]"=&f"(res2) + : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0), + [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2), + [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5) + : "memory" + ); + } + + __asm__ volatile ( + "lwc1 %[temp1], 0(%[alpha_1]) \n\t" + "lwc1 %[temp2], 4(%[alpha_1]) \n\t" + "lwc1 %[temp_real], 0(%[alpha_0]) \n\t" + "lwc1 %[temp_im], 4(%[alpha_0]) \n\t" + "mul.s %[res1], %[temp1], %[temp1] \n\t" + "madd.s %[res1], %[res1], %[temp2], %[temp2] \n\t" + "mul.s %[res2], %[temp_real], %[temp_real] \n\t" + "madd.s %[res2], %[res2], %[temp_im], %[temp_im] \n\t" + + : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im), + [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [res1]"=&f"(res1), [res2]"=&f"(res2) + : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0) + : "memory" + ); + + if (res1 >= 16.0f || res2 >= 16.0f) { + alpha_1[0] = 0; + alpha_1[1] = 0; + alpha_0[0] = 0; + alpha_0[1] = 0; + } + } +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c) +{ +#if HAVE_INLINE_ASM + c->sbr_lf_gen = sbr_lf_gen_mips; + 
c->sbr_x_gen = sbr_x_gen_mips; +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips; + c->sbr_hf_assemble = sbr_hf_assemble_mips; +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/aacsbr_mips.h b/libavcodec/mips/aacsbr_mips.h new file mode 100644 index 0000000000..4461e763ed --- /dev/null +++ b/libavcodec/mips/aacsbr_mips.h @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/aacsbr.c + */ + +#ifndef AVCODEC_MIPS_AACSBR_MIPS_H +#define AVCODEC_MIPS_AACSBR_MIPS_H + +#include "libavcodec/aac.h" +#include "libavcodec/sbr.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +static void sbr_qmf_analysis_mips(AVFloatDSPContext *fdsp, FFTContext *mdct, + SBRDSPContext *sbrdsp, const float *in, float *x, + float z[320], float W[2][32][32][2], int buf_idx) +{ + int i; + float *w0; + float *w1; + int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + w0 = x; + w1 = x + 1024; + for(i = 0; i < 36; i++) + { + /* loop unrolled 8 times */ + __asm__ volatile( + "lw %[temp0], 0(%[w1]) \n\t" + "lw %[temp1], 4(%[w1]) \n\t" + "lw %[temp2], 8(%[w1]) \n\t" + "lw %[temp3], 12(%[w1]) \n\t" + "lw %[temp4], 16(%[w1]) \n\t" + "lw %[temp5], 20(%[w1]) \n\t" + "lw %[temp6], 24(%[w1]) \n\t" + "lw %[temp7], 28(%[w1]) \n\t" + "sw %[temp0], 0(%[w0]) \n\t" + "sw %[temp1], 4(%[w0]) \n\t" + "sw %[temp2], 8(%[w0]) \n\t" + "sw %[temp3], 12(%[w0]) \n\t" + "sw %[temp4], 16(%[w0]) \n\t" + "sw %[temp5], 20(%[w0]) \n\t" + "sw %[temp6], 24(%[w0]) \n\t" + "sw %[temp7], 28(%[w0]) \n\t" + PTR_ADDIU " %[w0], %[w0], 32 \n\t" + PTR_ADDIU " %[w1], %[w1], 32 \n\t" + + : [w0]"+r"(w0), [w1]"+r"(w1), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) + : + : "memory" + ); + } + + w0 = x + 288; + w1 = (float*)in; + for(i = 0; i < 128; i++) + { + /* loop unrolled 8 times */ + __asm__ volatile( + "lw %[temp0], 0(%[w1]) \n\t" + "lw %[temp1], 4(%[w1]) \n\t" + "lw %[temp2], 8(%[w1]) \n\t" + "lw %[temp3], 12(%[w1]) \n\t" + "lw %[temp4], 16(%[w1]) \n\t" + "lw %[temp5], 20(%[w1]) \n\t" + "lw %[temp6], 24(%[w1]) \n\t" + "lw %[temp7], 28(%[w1]) \n\t" + "sw %[temp0], 0(%[w0]) \n\t" + "sw %[temp1], 4(%[w0]) \n\t" + "sw %[temp2], 8(%[w0]) \n\t" + "sw %[temp3], 12(%[w0]) \n\t" + "sw %[temp4], 16(%[w0]) \n\t" + "sw %[temp5], 20(%[w0]) \n\t" + "sw %[temp6], 24(%[w0]) \n\t" + "sw %[temp7], 28(%[w0]) \n\t" + PTR_ADDIU " %[w0], %[w0], 32 \n\t" + PTR_ADDIU " %[w1], %[w1], 32 \n\t" + + : [w0]"+r"(w0), [w1]"+r"(w1), + [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), + [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), + [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) + : + : "memory" + ); + } + + for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames + // are not supported + fdsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320); + sbrdsp->sum64x5(z); + sbrdsp->qmf_pre_shuffle(z); + mdct->imdct_half(mdct, z, z+64); + sbrdsp->qmf_post_shuffle(W[buf_idx][i], z); + x += 32; + } +} + +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void sbr_qmf_synthesis_mips(FFTContext *mdct, + SBRDSPContext *sbrdsp, AVFloatDSPContext *fdsp, + float *out, float X[2][38][64], + float mdct_buf[2][64], + float *v0, int *v_off, const unsigned int div) +{ + int i, n; + const float *sbr_qmf_window = div ? 
sbr_qmf_window_ds : sbr_qmf_window_us; + const int step = 128 >> div; + float *v; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13; + float temp14, temp15, temp16, temp17, temp18, temp19; + float *vv0, *s0, *dst; + dst = out; + + for (i = 0; i < 32; i++) { + if (*v_off < step) { + int saved_samples = (1280 - 128) >> div; + memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float)); + *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step; + } else { + *v_off -= step; + } + v = v0 + *v_off; + if (div) { + for (n = 0; n < 32; n++) { + X[0][i][ n] = -X[0][i][n]; + X[0][i][32+n] = X[1][i][31-n]; + } + mdct->imdct_half(mdct, mdct_buf[0], X[0][i]); + sbrdsp->qmf_deint_neg(v, mdct_buf[0]); + } else { + sbrdsp->neg_odd_64(X[1][i]); + mdct->imdct_half(mdct, mdct_buf[0], X[0][i]); + mdct->imdct_half(mdct, mdct_buf[1], X[1][i]); + sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]); + } + + if(div == 0) + { + float *v0_end; + vv0 = v; + v0_end = v + 60; + s0 = (float*)sbr_qmf_window; + + /* 10 calls of function vector_fmul_add merged into one loop + and loop unrolled 4 times */ + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + "lwc1 %[temp4], 0(%[v0]) \n\t" + "lwc1 %[temp5], 0(%[s0]) \n\t" + "lwc1 %[temp6], 4(%[v0]) \n\t" + "lwc1 %[temp7], 4(%[s0]) \n\t" + "lwc1 %[temp8], 8(%[v0]) \n\t" + "lwc1 %[temp9], 8(%[s0]) \n\t" + "lwc1 %[temp10], 12(%[v0]) \n\t" + "lwc1 %[temp11], 12(%[s0]) \n\t" + "lwc1 %[temp12], 768(%[v0]) \n\t" + "lwc1 %[temp13], 256(%[s0]) \n\t" + "lwc1 %[temp14], 772(%[v0]) \n\t" + "lwc1 %[temp15], 260(%[s0]) \n\t" + "lwc1 %[temp16], 776(%[v0]) \n\t" + "lwc1 %[temp17], 264(%[s0]) \n\t" + "lwc1 %[temp18], 780(%[v0]) \n\t" + "lwc1 %[temp19], 268(%[s0]) \n\t" + "1: \n\t" + "mul.s %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 1024(%[v0]) \n\t" + "mul.s %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 512(%[s0]) \n\t" + "mul.s %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 1028(%[v0]) \n\t" + "mul.s %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 516(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 1032(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 520(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 1036(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 524(%[s0]) \n\t" + "lwc1 %[temp12], 1792(%[v0]) \n\t" + "lwc1 %[temp13], 768(%[s0]) \n\t" + "lwc1 %[temp14], 1796(%[v0]) \n\t" + "lwc1 %[temp15], 772(%[s0]) \n\t" + "lwc1 %[temp16], 1800(%[v0]) \n\t" + "lwc1 %[temp17], 776(%[s0]) \n\t" + "lwc1 %[temp18], 1804(%[v0]) \n\t" + "lwc1 %[temp19], 780(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 2048(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 1024(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 2052(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 1028(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 2056(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 1032(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 2060(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 1036(%[s0]) \n\t" + "lwc1 %[temp12], 
2816(%[v0]) \n\t" + "lwc1 %[temp13], 1280(%[s0]) \n\t" + "lwc1 %[temp14], 2820(%[v0]) \n\t" + "lwc1 %[temp15], 1284(%[s0]) \n\t" + "lwc1 %[temp16], 2824(%[v0]) \n\t" + "lwc1 %[temp17], 1288(%[s0]) \n\t" + "lwc1 %[temp18], 2828(%[v0]) \n\t" + "lwc1 %[temp19], 1292(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 3072(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 1536(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 3076(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 1540(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 3080(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 1544(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 3084(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 1548(%[s0]) \n\t" + "lwc1 %[temp12], 3840(%[v0]) \n\t" + "lwc1 %[temp13], 1792(%[s0]) \n\t" + "lwc1 %[temp14], 3844(%[v0]) \n\t" + "lwc1 %[temp15], 1796(%[s0]) \n\t" + "lwc1 %[temp16], 3848(%[v0]) \n\t" + "lwc1 %[temp17], 1800(%[s0]) \n\t" + "lwc1 %[temp18], 3852(%[v0]) \n\t" + "lwc1 %[temp19], 1804(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 4096(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 2048(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 4100(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 2052(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 4104(%[v0]) \n\t" + PTR_ADDIU "%[dst], %[dst], 16 \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 2056(%[s0]) \n\t" + PTR_ADDIU " %[s0], %[s0], 16 \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 4108(%[v0]) \n\t" + PTR_ADDIU " %[v0], %[v0], 16 \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 2044(%[s0]) \n\t" + "lwc1 %[temp12], 4848(%[v0]) \n\t" + "lwc1 %[temp13], 2288(%[s0]) \n\t" + "lwc1 %[temp14], 4852(%[v0]) \n\t" + "lwc1 %[temp15], 2292(%[s0]) \n\t" + "lwc1 %[temp16], 4856(%[v0]) \n\t" + "lwc1 %[temp17], 2296(%[s0]) \n\t" + "lwc1 %[temp18], 4860(%[v0]) \n\t" + "lwc1 %[temp19], 2300(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 0(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 0(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 4(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 4(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 8(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 8(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 12(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 12(%[s0]) \n\t" + "lwc1 %[temp12], 768(%[v0]) \n\t" + "lwc1 %[temp13], 256(%[s0]) \n\t" + "lwc1 %[temp14], 772(%[v0]) \n\t" + "lwc1 %[temp15], 260(%[s0]) \n\t" + "lwc1 %[temp16], 776(%[v0]) \n\t" + "lwc1 %[temp17], 264(%[s0]) \n\t" + "lwc1 %[temp18], 780(%[v0]) \n\t" + "lwc1 %[temp19], 268(%[s0]) \n\t" + "swc1 %[temp0], -16(%[dst]) \n\t" + "swc1 %[temp1], -12(%[dst]) \n\t" + "swc1 
%[temp2], -8(%[dst]) \n\t" + "bne %[v0], %[v0_end], 1b \n\t" + " swc1 %[temp3], -4(%[dst]) \n\t" + "mul.s %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 1024(%[v0]) \n\t" + "mul.s %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 512(%[s0]) \n\t" + "mul.s %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 1028(%[v0]) \n\t" + "mul.s %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 516(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 1032(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 520(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 1036(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 524(%[s0]) \n\t" + "lwc1 %[temp12], 1792(%[v0]) \n\t" + "lwc1 %[temp13], 768(%[s0]) \n\t" + "lwc1 %[temp14], 1796(%[v0]) \n\t" + "lwc1 %[temp15], 772(%[s0]) \n\t" + "lwc1 %[temp16], 1800(%[v0]) \n\t" + "lwc1 %[temp17], 776(%[s0]) \n\t" + "lwc1 %[temp18], 1804(%[v0]) \n\t" + "lwc1 %[temp19], 780(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 2048(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 1024(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 2052(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 1028(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 2056(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 1032(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 2060(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 1036(%[s0]) \n\t" + "lwc1 %[temp12], 2816(%[v0]) \n\t" + "lwc1 %[temp13], 1280(%[s0]) \n\t" + "lwc1 %[temp14], 2820(%[v0]) \n\t" + "lwc1 %[temp15], 1284(%[s0]) \n\t" + "lwc1 %[temp16], 2824(%[v0]) \n\t" + "lwc1 %[temp17], 1288(%[s0]) \n\t" + "lwc1 %[temp18], 2828(%[v0]) \n\t" + "lwc1 %[temp19], 1292(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 3072(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 1536(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 3076(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 1540(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 3080(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 1544(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 3084(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 1548(%[s0]) \n\t" + "lwc1 %[temp12], 3840(%[v0]) \n\t" + "lwc1 %[temp13], 1792(%[s0]) \n\t" + "lwc1 %[temp14], 3844(%[v0]) \n\t" + "lwc1 %[temp15], 1796(%[s0]) \n\t" + "lwc1 %[temp16], 3848(%[v0]) \n\t" + "lwc1 %[temp17], 1800(%[s0]) \n\t" + "lwc1 %[temp18], 3852(%[v0]) \n\t" + "lwc1 %[temp19], 1804(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp4], 4096(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp5], 2048(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp6], 4100(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp7], 2052(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], 
%[temp12], %[temp13] \n\t" + "lwc1 %[temp8], 4104(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "lwc1 %[temp9], 2056(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "lwc1 %[temp10], 4108(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "lwc1 %[temp11], 2060(%[s0]) \n\t" + "lwc1 %[temp12], 4864(%[v0]) \n\t" + "lwc1 %[temp13], 2304(%[s0]) \n\t" + "lwc1 %[temp14], 4868(%[v0]) \n\t" + "lwc1 %[temp15], 2308(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp4], %[temp5] \n\t" + "lwc1 %[temp16], 4872(%[v0]) \n\t" + "madd.s %[temp1], %[temp1], %[temp6], %[temp7] \n\t" + "lwc1 %[temp17], 2312(%[s0]) \n\t" + "madd.s %[temp2], %[temp2], %[temp8], %[temp9] \n\t" + "lwc1 %[temp18], 4876(%[v0]) \n\t" + "madd.s %[temp3], %[temp3], %[temp10], %[temp11] \n\t" + "lwc1 %[temp19], 2316(%[s0]) \n\t" + "madd.s %[temp0], %[temp0], %[temp12], %[temp13] \n\t" + PTR_ADDIU "%[dst], %[dst], 16 \n\t" + "madd.s %[temp1], %[temp1], %[temp14], %[temp15] \n\t" + "madd.s %[temp2], %[temp2], %[temp16], %[temp17] \n\t" + "madd.s %[temp3], %[temp3], %[temp18], %[temp19] \n\t" + "swc1 %[temp0], -16(%[dst]) \n\t" + "swc1 %[temp1], -12(%[dst]) \n\t" + "swc1 %[temp2], -8(%[dst]) \n\t" + "swc1 %[temp3], -4(%[dst]) \n\t" + ".set pop \n\t" + + : [dst]"+r"(dst), [v0]"+r"(vv0), [s0]"+r"(s0), + [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11), + [temp12]"=&f"(temp12), [temp13]"=&f"(temp13), [temp14]"=&f"(temp14), + [temp15]"=&f"(temp15), [temp16]"=&f"(temp16), [temp17]"=&f"(temp17), + [temp18]"=&f"(temp18), [temp19]"=&f"(temp19) + : [v0_end]"r"(v0_end) + : "memory" + ); + } + else + { + fdsp->vector_fmul (out, v , sbr_qmf_window , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out , 64 >> div); + fdsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out , 64 >> div); + out += 64 >> div; + } + } +} + +#define sbr_qmf_analysis sbr_qmf_analysis_mips +#define sbr_qmf_synthesis sbr_qmf_synthesis_mips + +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_MIPS_AACSBR_MIPS_H */ diff --git a/libavcodec/mips/ac3dsp_mips.c b/libavcodec/mips/ac3dsp_mips.c new file mode 100644 index 0000000000..f9aaf15639 --- /dev/null +++ b/libavcodec/mips/ac3dsp_mips.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Branimir Vasic (bvasic@mips.com) + * Nedeljko Babic (nbabic@mips.com) + * + * Various AC-3 DSP Utils optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/ac3dsp.c + */ + +#include "config.h" +#include "libavcodec/ac3dsp.h" +#include "libavcodec/ac3.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if HAVE_MIPSDSP +static void ac3_bit_alloc_calc_bap_mips(int16_t *mask, int16_t *psd, + int start, int end, + int snr_offset, int floor, + const uint8_t *bap_tab, uint8_t *bap) +{ + int band, band_end, cond; + int m, address1, address2; + int16_t *psd1, *psd_end; + uint8_t *bap1; + + if (snr_offset == -960) { + memset(bap, 0, AC3_MAX_COEFS); + return; + } + + psd1 = &psd[start]; + bap1 = &bap[start]; + band = ff_ac3_bin_to_band_tab[start]; + + do { + m = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor; + band_end = ff_ac3_band_start_tab[++band]; + band_end = FFMIN(band_end, end); + psd_end = psd + band_end - 1; + + __asm__ volatile ( + "slt %[cond], %[psd1], %[psd_end] \n\t" + "beqz %[cond], 1f \n\t" + "2: \n\t" + "lh %[address1], 0(%[psd1]) \n\t" + "lh %[address2], 2(%[psd1]) \n\t" + PTR_ADDIU " %[psd1], %[psd1], 4 \n\t" + "subu %[address1], %[address1], %[m] \n\t" + "sra %[address1], %[address1], 5 \n\t" + "addiu %[address1], %[address1], -32 \n\t" + "shll_s.w %[address1], %[address1], 26 \n\t" + "subu %[address2], %[address2], %[m] \n\t" + "sra %[address2], %[address2], 5 \n\t" + "sra %[address1], %[address1], 26 \n\t" + "addiu %[address1], %[address1], 32 \n\t" + "lbux %[address1], %[address1](%[bap_tab]) \n\t" + "addiu %[address2], %[address2], -32 \n\t" + "shll_s.w %[address2], %[address2], 26 \n\t" + "sb %[address1], 0(%[bap1]) \n\t" + "slt %[cond], %[psd1], %[psd_end] \n\t" + "sra %[address2], %[address2], 26 \n\t" + "addiu %[address2], %[address2], 32 \n\t" + "lbux %[address2], %[address2](%[bap_tab]) \n\t" + "sb %[address2], 1(%[bap1]) \n\t" + PTR_ADDIU " %[bap1], %[bap1], 2 \n\t" + "bnez %[cond], 2b \n\t" + PTR_ADDIU " %[psd_end], %[psd_end], 2 \n\t" + "slt %[cond], %[psd1], %[psd_end] \n\t" + "beqz %[cond], 3f \n\t" + "1: \n\t" + "lh %[address1], 0(%[psd1]) \n\t" + PTR_ADDIU " %[psd1], %[psd1], 2 \n\t" + "subu %[address1], %[address1], %[m] \n\t" + "sra %[address1], %[address1], 5 \n\t" + "addiu %[address1], %[address1], -32 \n\t" + "shll_s.w %[address1], %[address1], 26 \n\t" + "sra %[address1], %[address1], 26 \n\t" + "addiu %[address1], %[address1], 32 \n\t" + "lbux %[address1], %[address1](%[bap_tab]) \n\t" + "sb %[address1], 0(%[bap1]) \n\t" + PTR_ADDIU " %[bap1], %[bap1], 1 \n\t" + "3: \n\t" + + : [address1]"=&r"(address1), [address2]"=&r"(address2), + [cond]"=&r"(cond), [bap1]"+r"(bap1), + [psd1]"+r"(psd1), [psd_end]"+r"(psd_end) + : [m]"r"(m), [bap_tab]"r"(bap_tab) + : "memory" + ); + } while (end > band_end); +} + +static void ac3_update_bap_counts_mips(uint16_t mant_cnt[16], uint8_t *bap, + int len) +{ + void *temp0, *temp2, *temp4, *temp5, *temp6, *temp7; + int temp1, temp3; + + __asm__ volatile ( + "andi %[temp3], %[len], 3 \n\t" + PTR_ADDU "%[temp2], %[bap], %[len] \n\t" + PTR_ADDU "%[temp4], %[bap], %[temp3] \n\t" + "beq %[temp2], %[temp4], 4f \n\t" + "1: \n\t" + "lbu %[temp0], -1(%[temp2]) \n\t" + "lbu %[temp5], -2(%[temp2]) \n\t" + "lbu %[temp6], -3(%[temp2]) \n\t" + "sll %[temp0], %[temp0], 1 \n\t" + PTR_ADDU "%[temp0], %[mant_cnt], %[temp0] \n\t" + "sll %[temp5], %[temp5], 1 \n\t" + PTR_ADDU "%[temp5], %[mant_cnt], %[temp5] 
\n\t" + "lhu %[temp1], 0(%[temp0]) \n\t" + "sll %[temp6], %[temp6], 1 \n\t" + PTR_ADDU "%[temp6], %[mant_cnt], %[temp6] \n\t" + "addiu %[temp1], %[temp1], 1 \n\t" + "sh %[temp1], 0(%[temp0]) \n\t" + "lhu %[temp1], 0(%[temp5]) \n\t" + "lbu %[temp7], -4(%[temp2]) \n\t" + PTR_ADDIU "%[temp2],%[temp2], -4 \n\t" + "addiu %[temp1], %[temp1], 1 \n\t" + "sh %[temp1], 0(%[temp5]) \n\t" + "lhu %[temp1], 0(%[temp6]) \n\t" + "sll %[temp7], %[temp7], 1 \n\t" + PTR_ADDU "%[temp7], %[mant_cnt], %[temp7] \n\t" + "addiu %[temp1], %[temp1],1 \n\t" + "sh %[temp1], 0(%[temp6]) \n\t" + "lhu %[temp1], 0(%[temp7]) \n\t" + "addiu %[temp1], %[temp1], 1 \n\t" + "sh %[temp1], 0(%[temp7]) \n\t" + "bne %[temp2], %[temp4], 1b \n\t" + "4: \n\t" + "beqz %[temp3], 2f \n\t" + "3: \n\t" + "addiu %[temp3], %[temp3], -1 \n\t" + "lbu %[temp0], -1(%[temp2]) \n\t" + PTR_ADDIU "%[temp2],%[temp2], -1 \n\t" + "sll %[temp0], %[temp0], 1 \n\t" + PTR_ADDU "%[temp0], %[mant_cnt], %[temp0] \n\t" + "lhu %[temp1], 0(%[temp0]) \n\t" + "addiu %[temp1], %[temp1], 1 \n\t" + "sh %[temp1], 0(%[temp0]) \n\t" + "bgtz %[temp3], 3b \n\t" + "2: \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), + [temp6] "=&r" (temp6), [temp7] "=&r" (temp7) + : [len] "r" (len), [bap] "r" (bap), + [mant_cnt] "r" (mant_cnt) + : "memory" + ); +} +#endif + +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void float_to_fixed24_mips(int32_t *dst, const float *src, unsigned int len) +{ + const float scale = 1 << 24; + float src0, src1, src2, src3, src4, src5, src6, src7; + int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + do { + __asm__ volatile ( + "lwc1 %[src0], 0(%[src]) \n\t" + "lwc1 %[src1], 4(%[src]) \n\t" + "lwc1 %[src2], 8(%[src]) \n\t" + "lwc1 %[src3], 12(%[src]) \n\t" + "lwc1 %[src4], 16(%[src]) \n\t" + "lwc1 %[src5], 20(%[src]) \n\t" + "lwc1 %[src6], 24(%[src]) \n\t" + "lwc1 %[src7], 28(%[src]) \n\t" + "mul.s %[src0], %[src0], %[scale] \n\t" + "mul.s %[src1], %[src1], %[scale] \n\t" + "mul.s %[src2], %[src2], %[scale] \n\t" + "mul.s %[src3], %[src3], %[scale] \n\t" + "mul.s %[src4], %[src4], %[scale] \n\t" + "mul.s %[src5], %[src5], %[scale] \n\t" + "mul.s %[src6], %[src6], %[scale] \n\t" + "mul.s %[src7], %[src7], %[scale] \n\t" + "cvt.w.s %[src0], %[src0] \n\t" + "cvt.w.s %[src1], %[src1] \n\t" + "cvt.w.s %[src2], %[src2] \n\t" + "cvt.w.s %[src3], %[src3] \n\t" + "cvt.w.s %[src4], %[src4] \n\t" + "cvt.w.s %[src5], %[src5] \n\t" + "cvt.w.s %[src6], %[src6] \n\t" + "cvt.w.s %[src7], %[src7] \n\t" + "mfc1 %[temp0], %[src0] \n\t" + "mfc1 %[temp1], %[src1] \n\t" + "mfc1 %[temp2], %[src2] \n\t" + "mfc1 %[temp3], %[src3] \n\t" + "mfc1 %[temp4], %[src4] \n\t" + "mfc1 %[temp5], %[src5] \n\t" + "mfc1 %[temp6], %[src6] \n\t" + "mfc1 %[temp7], %[src7] \n\t" + "sw %[temp0], 0(%[dst]) \n\t" + "sw %[temp1], 4(%[dst]) \n\t" + "sw %[temp2], 8(%[dst]) \n\t" + "sw %[temp3], 12(%[dst]) \n\t" + "sw %[temp4], 16(%[dst]) \n\t" + "sw %[temp5], 20(%[dst]) \n\t" + "sw %[temp6], 24(%[dst]) \n\t" + "sw %[temp7], 28(%[dst]) \n\t" + + : [dst] "+r" (dst), [src] "+r" (src), + [src0] "=&f" (src0), [src1] "=&f" (src1), + [src2] "=&f" (src2), [src3] "=&f" (src3), + [src4] "=&f" (src4), [src5] "=&f" (src5), + [src6] "=&f" (src6), [src7] "=&f" (src7), + [temp0] "=r" (temp0), [temp1] "=r" (temp1), + [temp2] "=r" (temp2), [temp3] "=r" (temp3), + [temp4] "=r" (temp4), [temp5] "=r" (temp5), + [temp6] "=r" (temp6), [temp7] "=r" (temp7) + : [scale] "f" (scale) + : "memory" 
+ ); + src = src + 8; + dst = dst + 8; + len -= 8; + } while (len > 0); +} + +static void ac3_downmix_mips(float **samples, float (*matrix)[2], + int out_ch, int in_ch, int len) +{ + int i, j, i1, i2, i3; + float v0, v1, v2, v3; + float v4, v5, v6, v7; + float samples0, samples1, samples2, samples3, matrix_j, matrix_j2; + float *samples_p, *samples_sw, *matrix_p, **samples_x, **samples_end; + + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + + "li %[i1], 2 \n\t" + "sll %[len], 2 \n\t" + "move %[i], $zero \n\t" + "sll %[j], %[in_ch], " PTRLOG " \n\t" + + "bne %[out_ch], %[i1], 3f \n\t" // if (out_ch == 2) + " li %[i2], 1 \n\t" + + "2: \n\t" // start of the for loop (for (i = 0; i < len; i+=4)) + "move %[matrix_p], %[matrix] \n\t" + "move %[samples_x], %[samples] \n\t" + "mtc1 $zero, %[v0] \n\t" + "mtc1 $zero, %[v1] \n\t" + "mtc1 $zero, %[v2] \n\t" + "mtc1 $zero, %[v3] \n\t" + "mtc1 $zero, %[v4] \n\t" + "mtc1 $zero, %[v5] \n\t" + "mtc1 $zero, %[v6] \n\t" + "mtc1 $zero, %[v7] \n\t" + "addiu %[i1], %[i], 4 \n\t" + "addiu %[i2], %[i], 8 \n\t" + PTR_L " %[samples_p], 0(%[samples_x]) \n\t" + "addiu %[i3], %[i], 12 \n\t" + PTR_ADDU "%[samples_end],%[samples_x], %[j] \n\t" + "move %[samples_sw], %[samples_p] \n\t" + + "1: \n\t" // start of the inner for loop (for (j = 0; j < in_ch; j++)) + "lwc1 %[matrix_j], 0(%[matrix_p]) \n\t" + "lwc1 %[matrix_j2], 4(%[matrix_p]) \n\t" + "lwxc1 %[samples0], %[i](%[samples_p]) \n\t" + "lwxc1 %[samples1], %[i1](%[samples_p]) \n\t" + "lwxc1 %[samples2], %[i2](%[samples_p]) \n\t" + "lwxc1 %[samples3], %[i3](%[samples_p]) \n\t" + PTR_ADDIU "%[matrix_p], 8 \n\t" + PTR_ADDIU "%[samples_x]," PTRSIZE " \n\t" + "madd.s %[v0], %[v0], %[samples0], %[matrix_j] \n\t" + "madd.s %[v1], %[v1], %[samples1], %[matrix_j] \n\t" + "madd.s %[v2], %[v2], %[samples2], %[matrix_j] \n\t" + "madd.s %[v3], %[v3], %[samples3], %[matrix_j] \n\t" + "madd.s %[v4], %[v4], %[samples0], %[matrix_j2]\n\t" + "madd.s %[v5], %[v5], %[samples1], %[matrix_j2]\n\t" + "madd.s %[v6], %[v6], %[samples2], %[matrix_j2]\n\t" + "madd.s %[v7], %[v7], %[samples3], %[matrix_j2]\n\t" + "bne %[samples_x], %[samples_end], 1b \n\t" + PTR_L " %[samples_p], 0(%[samples_x]) \n\t" + + PTR_L " %[samples_p], " PTRSIZE "(%[samples]) \n\t" + "swxc1 %[v0], %[i](%[samples_sw]) \n\t" + "swxc1 %[v1], %[i1](%[samples_sw]) \n\t" + "swxc1 %[v2], %[i2](%[samples_sw]) \n\t" + "swxc1 %[v3], %[i3](%[samples_sw]) \n\t" + "swxc1 %[v4], %[i](%[samples_p]) \n\t" + "addiu %[i], 16 \n\t" + "swxc1 %[v5], %[i1](%[samples_p]) \n\t" + "swxc1 %[v6], %[i2](%[samples_p]) \n\t" + "bne %[i], %[len], 2b \n\t" + " swxc1 %[v7], %[i3](%[samples_p]) \n\t" + + "3: \n\t" + "bne %[out_ch], %[i2], 6f \n\t" // if (out_ch == 1) + " nop \n\t" + + "5: \n\t" // start of the outer for loop (for (i = 0; i < len; i+=4)) + "move %[matrix_p], %[matrix] \n\t" + "move %[samples_x], %[samples] \n\t" + "mtc1 $zero, %[v0] \n\t" + "mtc1 $zero, %[v1] \n\t" + "mtc1 $zero, %[v2] \n\t" + "mtc1 $zero, %[v3] \n\t" + "addiu %[i1], %[i], 4 \n\t" + "addiu %[i2], %[i], 8 \n\t" + PTR_L " %[samples_p], 0(%[samples_x]) \n\t" + "addiu %[i3], %[i], 12 \n\t" + PTR_ADDU "%[samples_end],%[samples_x], %[j] \n\t" + "move %[samples_sw], %[samples_p] \n\t" + + "4: \n\t" // start of the inner for loop (for (j = 0; j < in_ch; j++)) + "lwc1 %[matrix_j], 0(%[matrix_p]) \n\t" + "lwxc1 %[samples0], %[i](%[samples_p]) \n\t" + "lwxc1 %[samples1], %[i1](%[samples_p]) \n\t" + "lwxc1 %[samples2], %[i2](%[samples_p]) \n\t" + "lwxc1 %[samples3], %[i3](%[samples_p]) \n\t" + PTR_ADDIU "%[matrix_p], 
8 \n\t" + PTR_ADDIU "%[samples_x]," PTRSIZE " \n\t" + "madd.s %[v0], %[v0], %[samples0], %[matrix_j] \n\t" + "madd.s %[v1], %[v1], %[samples1], %[matrix_j] \n\t" + "madd.s %[v2], %[v2], %[samples2], %[matrix_j] \n\t" + "madd.s %[v3], %[v3], %[samples3], %[matrix_j] \n\t" + "bne %[samples_x], %[samples_end], 4b \n\t" + PTR_L " %[samples_p], 0(%[samples_x]) \n\t" + + "swxc1 %[v0], %[i](%[samples_sw]) \n\t" + "addiu %[i], 16 \n\t" + "swxc1 %[v1], %[i1](%[samples_sw]) \n\t" + "swxc1 %[v2], %[i2](%[samples_sw]) \n\t" + "bne %[i], %[len], 5b \n\t" + " swxc1 %[v3], %[i3](%[samples_sw]) \n\t" + "6: \n\t" + + ".set pop" + :[samples_p]"=&r"(samples_p), [matrix_j]"=&f"(matrix_j), [matrix_j2]"=&f"(matrix_j2), + [samples0]"=&f"(samples0), [samples1]"=&f"(samples1), + [samples2]"=&f"(samples2), [samples3]"=&f"(samples3), + [v0]"=&f"(v0), [v1]"=&f"(v1), [v2]"=&f"(v2), [v3]"=&f"(v3), + [v4]"=&f"(v4), [v5]"=&f"(v5), [v6]"=&f"(v6), [v7]"=&f"(v7), + [samples_x]"=&r"(samples_x), [matrix_p]"=&r"(matrix_p), + [samples_end]"=&r"(samples_end), [samples_sw]"=&r"(samples_sw), + [i1]"=&r"(i1), [i2]"=&r"(i2), [i3]"=&r"(i3), [i]"=&r"(i), + [j]"=&r"(j), [len]"+r"(len) + :[samples]"r"(samples), [matrix]"r"(matrix), + [in_ch]"r"(in_ch), [out_ch]"r"(out_ch) + :"memory" + ); +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact) { +#if HAVE_INLINE_ASM +#if HAVE_MIPSDSP + c->bit_alloc_calc_bap = ac3_bit_alloc_calc_bap_mips; + c->update_bap_counts = ac3_update_bap_counts_mips; +#endif +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->float_to_fixed24 = float_to_fixed24_mips; + c->downmix = ac3_downmix_mips; +#endif +#endif + +#endif +} diff --git a/libavcodec/mips/acelp_filters_mips.c b/libavcodec/mips/acelp_filters_mips.c new file mode 100644 index 0000000000..478db855b2 --- /dev/null +++ b/libavcodec/mips/acelp_filters_mips.c @@ -0,0 +1,221 @@ + /* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * various filters for ACELP-based codecs optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/acelp_filters.c + */ +#include "config.h" +#include "libavutil/attributes.h" +#include "libavcodec/acelp_filters.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void ff_acelp_interpolatef_mips(float *out, const float *in, + const float *filter_coeffs, int precision, + int frac_pos, int filter_length, int length) +{ + int n, i; + int prec = precision * 4; + int fc_offset = precision - frac_pos; + float in_val_p, in_val_m, fc_val_p, fc_val_m; + + for (n = 0; n < length; n++) { + /** + * four pointers are defined in order to minimize number of + * computations done in inner loop + */ + const float *p_in_p = &in[n]; + const float *p_in_m = &in[n-1]; + const float *p_filter_coeffs_p = &filter_coeffs[frac_pos]; + const float *p_filter_coeffs_m = filter_coeffs + fc_offset; + float v = 0; + + for (i = 0; i < filter_length;i++) { + __asm__ volatile ( + "lwc1 %[in_val_p], 0(%[p_in_p]) \n\t" + "lwc1 %[fc_val_p], 0(%[p_filter_coeffs_p]) \n\t" + "lwc1 %[in_val_m], 0(%[p_in_m]) \n\t" + "lwc1 %[fc_val_m], 0(%[p_filter_coeffs_m]) \n\t" + PTR_ADDIU "%[p_in_p], %[p_in_p], 4 \n\t" + "madd.s %[v],%[v], %[in_val_p],%[fc_val_p] \n\t" + PTR_ADDIU "%[p_in_m], %[p_in_m], -4 \n\t" + PTR_ADDU "%[p_filter_coeffs_p],%[p_filter_coeffs_p], %[prec] \n\t" + PTR_ADDU "%[p_filter_coeffs_m],%[p_filter_coeffs_m], %[prec] \n\t" + "madd.s %[v],%[v],%[in_val_m], %[fc_val_m] \n\t" + + : [v] "+&f" (v),[p_in_p] "+r" (p_in_p), [p_in_m] "+r" (p_in_m), + [p_filter_coeffs_p] "+r" (p_filter_coeffs_p), + [in_val_p] "=&f" (in_val_p), [in_val_m] "=&f" (in_val_m), + [fc_val_p] "=&f" (fc_val_p), [fc_val_m] "=&f" (fc_val_m), + [p_filter_coeffs_m] "+r" (p_filter_coeffs_m) + : [prec] "r" (prec) + : "memory" + ); + } + out[n] = v; + } +} + +static void ff_acelp_apply_order_2_transfer_function_mips(float *out, const float *in, + const float zero_coeffs[2], + const float pole_coeffs[2], + float gain, float mem[2], int n) +{ + /** + * loop is unrolled eight times + */ + + __asm__ volatile ( + "lwc1 $f0, 0(%[mem]) \n\t" + "blez %[n], ff_acelp_apply_order_2_transfer_function_end%= \n\t" + "lwc1 $f1, 4(%[mem]) \n\t" + "lwc1 $f2, 0(%[pole_coeffs]) \n\t" + "lwc1 $f3, 4(%[pole_coeffs]) \n\t" + "lwc1 $f4, 0(%[zero_coeffs]) \n\t" + "lwc1 $f5, 4(%[zero_coeffs]) \n\t" + + "ff_acelp_apply_order_2_transfer_function_madd%=: \n\t" + + "lwc1 $f6, 0(%[in]) \n\t" + "mul.s $f9, $f3, $f1 \n\t" + "mul.s $f7, $f2, $f0 \n\t" + "msub.s $f7, $f7, %[gain], $f6 \n\t" + "sub.s $f7, $f7, $f9 \n\t" + "madd.s $f8, $f7, $f4, $f0 \n\t" + "madd.s $f8, $f8, $f5, $f1 \n\t" + "lwc1 $f11, 4(%[in]) \n\t" + 
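/* second of the eight unrolled filter steps: the intermediate value just computed in $f7 now serves as mem[0] and the previous mem[0] ($f0) as mem[1], so the filter state rotates through registers instead of being written back each step */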
"mul.s $f12, $f3, $f0 \n\t" + "mul.s $f13, $f2, $f7 \n\t" + "msub.s $f13, $f13, %[gain], $f11 \n\t" + "sub.s $f13, $f13, $f12 \n\t" + "madd.s $f14, $f13, $f4, $f7 \n\t" + "madd.s $f14, $f14, $f5, $f0 \n\t" + "swc1 $f8, 0(%[out]) \n\t" + "lwc1 $f6, 8(%[in]) \n\t" + "mul.s $f9, $f3, $f7 \n\t" + "mul.s $f15, $f2, $f13 \n\t" + "msub.s $f15, $f15, %[gain], $f6 \n\t" + "sub.s $f15, $f15, $f9 \n\t" + "madd.s $f8, $f15, $f4, $f13 \n\t" + "madd.s $f8, $f8, $f5, $f7 \n\t" + "swc1 $f14, 4(%[out]) \n\t" + "lwc1 $f11, 12(%[in]) \n\t" + "mul.s $f12, $f3, $f13 \n\t" + "mul.s $f16, $f2, $f15 \n\t" + "msub.s $f16, $f16, %[gain], $f11 \n\t" + "sub.s $f16, $f16, $f12 \n\t" + "madd.s $f14, $f16, $f4, $f15 \n\t" + "madd.s $f14, $f14, $f5, $f13 \n\t" + "swc1 $f8, 8(%[out]) \n\t" + "lwc1 $f6, 16(%[in]) \n\t" + "mul.s $f9, $f3, $f15 \n\t" + "mul.s $f7, $f2, $f16 \n\t" + "msub.s $f7, $f7, %[gain], $f6 \n\t" + "sub.s $f7, $f7, $f9 \n\t" + "madd.s $f8, $f7, $f4, $f16 \n\t" + "madd.s $f8, $f8, $f5, $f15 \n\t" + "swc1 $f14, 12(%[out]) \n\t" + "lwc1 $f11, 20(%[in]) \n\t" + "mul.s $f12, $f3, $f16 \n\t" + "mul.s $f13, $f2, $f7 \n\t" + "msub.s $f13, $f13, %[gain], $f11 \n\t" + "sub.s $f13, $f13, $f12 \n\t" + "madd.s $f14, $f13, $f4, $f7 \n\t" + "madd.s $f14, $f14, $f5, $f16 \n\t" + "swc1 $f8, 16(%[out]) \n\t" + "lwc1 $f6, 24(%[in]) \n\t" + "mul.s $f9, $f3, $f7 \n\t" + "mul.s $f15, $f2, $f13 \n\t" + "msub.s $f15, $f15, %[gain], $f6 \n\t" + "sub.s $f1, $f15, $f9 \n\t" + "madd.s $f8, $f1, $f4, $f13 \n\t" + "madd.s $f8, $f8, $f5, $f7 \n\t" + "swc1 $f14, 20(%[out]) \n\t" + "lwc1 $f11, 28(%[in]) \n\t" + "mul.s $f12, $f3, $f13 \n\t" + "mul.s $f16, $f2, $f1 \n\t" + "msub.s $f16, $f16, %[gain], $f11 \n\t" + "sub.s $f0, $f16, $f12 \n\t" + "madd.s $f14, $f0, $f4, $f1 \n\t" + "madd.s $f14, $f14, $f5, $f13 \n\t" + "swc1 $f8, 24(%[out]) \n\t" + PTR_ADDIU "%[out], 32 \n\t" + PTR_ADDIU "%[in], 32 \n\t" + "addiu %[n], -8 \n\t" + "swc1 $f14, -4(%[out]) \n\t" + "bnez %[n], ff_acelp_apply_order_2_transfer_function_madd%= \n\t" + "swc1 $f1, 4(%[mem]) \n\t" + "swc1 $f0, 0(%[mem]) \n\t" + + "ff_acelp_apply_order_2_transfer_function_end%=: \n\t" + + : [out] "+r" (out), + [in] "+r" (in), [gain] "+f" (gain), + [n] "+r" (n), [mem] "+r" (mem) + : [zero_coeffs] "r" (zero_coeffs), + [pole_coeffs] "r" (pole_coeffs) + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", + "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", + "$f12", "$f13", "$f14", "$f15", "$f16", "memory" + ); +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_acelp_filter_init_mips(ACELPFContext *c) +{ +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->acelp_interpolatef = ff_acelp_interpolatef_mips; + c->acelp_apply_order_2_transfer_function = ff_acelp_apply_order_2_transfer_function_mips; +#endif +#endif +} diff --git a/libavcodec/mips/acelp_vectors_mips.c b/libavcodec/mips/acelp_vectors_mips.c new file mode 100644 index 0000000000..0ab2b6a87b --- /dev/null +++ b/libavcodec/mips/acelp_vectors_mips.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * adaptive and fixed codebook vector operations for ACELP-based codecs + * optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/acelp_vectors.c + */ +#include "config.h" +#include "libavcodec/acelp_vectors.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void ff_weighted_vector_sumf_mips( + float *out, const float *in_a, const float *in_b, + float weight_coeff_a, float weight_coeff_b, int length) +{ + const float *a_end = in_a + length; + + /* loop unrolled two times */ + __asm__ volatile ( + "blez %[length], ff_weighted_vector_sumf_end%= \n\t" + + "ff_weighted_vector_sumf_madd%=: \n\t" + "lwc1 $f0, 0(%[in_a]) \n\t" + "lwc1 $f3, 4(%[in_a]) \n\t" + "lwc1 $f1, 0(%[in_b]) \n\t" + "lwc1 $f4, 4(%[in_b]) \n\t" + "mul.s $f2, %[weight_coeff_a], $f0 \n\t" + "mul.s $f5, %[weight_coeff_a], $f3 \n\t" + "madd.s $f2, $f2, %[weight_coeff_b], $f1 \n\t" + "madd.s $f5, $f5, %[weight_coeff_b], $f4 \n\t" + PTR_ADDIU "%[in_a],8 \n\t" + PTR_ADDIU "%[in_b],8 \n\t" + "swc1 $f2, 0(%[out]) \n\t" + "swc1 $f5, 4(%[out]) \n\t" + PTR_ADDIU "%[out], 8 \n\t" + "bne %[in_a], %[a_end], ff_weighted_vector_sumf_madd%= \n\t" + + "ff_weighted_vector_sumf_end%=: \n\t" + + : [out] "+r" (out), [in_a] "+r" (in_a), [in_b] "+r" (in_b) + : [weight_coeff_a] "f" (weight_coeff_a), + [weight_coeff_b] "f" (weight_coeff_b), + [length] "r" (length), [a_end]"r"(a_end) + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory" + ); +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_acelp_vectors_init_mips(ACELPVContext *c) +{ +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->weighted_vector_sumf = ff_weighted_vector_sumf_mips; +#endif +#endif +} diff --git a/libavcodec/mips/amrwbdec_mips.c b/libavcodec/mips/amrwbdec_mips.c new file mode 100644 index 0000000000..5dc054361b --- /dev/null +++ b/libavcodec/mips/amrwbdec_mips.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. 
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/amrwbdec.c + */ +#include "libavutil/avutil.h" +#include "libavcodec/amrwbdata.h" +#include "amrwbdec_mips.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +void ff_hb_fir_filter_mips(float *out, const float fir_coef[HB_FIR_SIZE + 1], + float mem[HB_FIR_SIZE], const float *in) +{ + int i; + float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples + + memcpy(data, mem, HB_FIR_SIZE * sizeof(float)); + memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float)); + + for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) { + float output; + float * p_data = (data+i); + + /** + * inner loop is entirely unrolled and instructions are scheduled + * to minimize pipeline stall + */ + __asm__ volatile( + "mtc1 $zero, %[output] \n\t" + "lwc1 $f0, 0(%[p_data]) \n\t" + "lwc1 $f1, 0(%[fir_coef]) \n\t" + "lwc1 $f2, 4(%[p_data]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f3, 4(%[fir_coef]) \n\t" + "lwc1 $f4, 8(%[p_data]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + "lwc1 $f5, 8(%[fir_coef]) \n\t" + + "lwc1 $f0, 12(%[p_data]) \n\t" + "lwc1 $f1, 12(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f2, 16(%[p_data]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f3, 16(%[fir_coef]) \n\t" + "lwc1 $f4, 20(%[p_data]) \n\t" + "lwc1 $f5, 20(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 24(%[p_data]) \n\t" + "lwc1 $f1, 24(%[fir_coef]) \n\t" + "lwc1 $f2, 28(%[p_data]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f3, 28(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f4, 32(%[p_data]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + "lwc1 $f5, 32(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + + "lwc1 $f0, 36(%[p_data]) \n\t" + "lwc1 $f1, 36(%[fir_coef]) \n\t" + "lwc1 $f2, 40(%[p_data]) \n\t" + "lwc1 $f3, 40(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f4, 44(%[p_data]) \n\t" + "lwc1 $f5, 44(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 48(%[p_data]) \n\t" + "lwc1 $f1, 48(%[fir_coef]) \n\t" + "lwc1 $f2, 52(%[p_data]) \n\t" + 
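/* the remaining taps follow the same pattern: loads of data[i+k] and fir_coef[k] stay interleaved with the madd.s accumulations into output so the loads hide behind the multiply-adds */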
"madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f3, 52(%[fir_coef]) \n\t" + "lwc1 $f4, 56(%[p_data]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f5, 56(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 60(%[p_data]) \n\t" + "lwc1 $f1, 60(%[fir_coef]) \n\t" + "lwc1 $f2, 64(%[p_data]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f3, 64(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f4, 68(%[p_data]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + "lwc1 $f5, 68(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + + "lwc1 $f0, 72(%[p_data]) \n\t" + "lwc1 $f1, 72(%[fir_coef]) \n\t" + "lwc1 $f2, 76(%[p_data]) \n\t" + "lwc1 $f3, 76(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f4, 80(%[p_data]) \n\t" + "lwc1 $f5, 80(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 84(%[p_data]) \n\t" + "lwc1 $f1, 84(%[fir_coef]) \n\t" + "lwc1 $f2, 88(%[p_data]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f3, 88(%[fir_coef]) \n\t" + "lwc1 $f4, 92(%[p_data]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f5, 92(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 96(%[p_data]) \n\t" + "lwc1 $f1, 96(%[fir_coef]) \n\t" + "lwc1 $f2, 100(%[p_data]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f3, 100(%[fir_coef]) \n\t" + "lwc1 $f4, 104(%[p_data]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f5, 104(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + + "lwc1 $f0, 108(%[p_data]) \n\t" + "lwc1 $f1, 108(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "lwc1 $f2, 112(%[p_data]) \n\t" + "lwc1 $f3, 112(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + "lwc1 $f4, 116(%[p_data]) \n\t" + "lwc1 $f5, 116(%[fir_coef]) \n\t" + "lwc1 $f0, 120(%[p_data]) \n\t" + "madd.s %[output], %[output], $f2, $f3 \n\t" + "lwc1 $f1, 120(%[fir_coef]) \n\t" + "madd.s %[output], %[output], $f4, $f5 \n\t" + "madd.s %[output], %[output], $f0, $f1 \n\t" + + : [output]"=&f"(output) + : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data) + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory" + ); + out[i] = output; + } + memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float)); +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h new file mode 100644 index 0000000000..a9f66fef94 --- /dev/null +++ b/libavcodec/mips/amrwbdec_mips.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/amrwbdec.c + */ +#ifndef AVCODEC_MIPS_AMRWBDEC_MIPS_H +#define AVCODEC_MIPS_AMRWBDEC_MIPS_H +#include "config.h" + +#if HAVE_MIPSFPU && HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +void ff_hb_fir_filter_mips(float *out, const float fir_coef[], + float mem[], const float *in); +#define hb_fir_filter ff_hb_fir_filter_mips +#endif +#endif + +#endif /* AVCODEC_MIPS_AMRWBDEC_MIPS_H */ diff --git a/libavcodec/mips/blockdsp_init_mips.c b/libavcodec/mips/blockdsp_init_mips.c new file mode 100644 index 0000000000..30ae95fa10 --- /dev/null +++ b/libavcodec/mips/blockdsp_init_mips.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "blockdsp_mips.h" + +#if HAVE_MSA +static av_cold void blockdsp_init_msa(BlockDSPContext *c) +{ + c->clear_block = ff_clear_block_msa; + c->clear_blocks = ff_clear_blocks_msa; + + c->fill_block_tab[0] = ff_fill_block16_msa; + c->fill_block_tab[1] = ff_fill_block8_msa; +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void blockdsp_init_mmi(BlockDSPContext *c) +{ + c->clear_block = ff_clear_block_mmi; + c->clear_blocks = ff_clear_blocks_mmi; + + c->fill_block_tab[0] = ff_fill_block16_mmi; + c->fill_block_tab[1] = ff_fill_block8_mmi; +} +#endif /* HAVE_MMI */ + +void ff_blockdsp_init_mips(BlockDSPContext *c) +{ +#if HAVE_MSA + blockdsp_init_msa(c); +#endif // #if HAVE_MSA +#if HAVE_MMI + blockdsp_init_mmi(c); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/blockdsp_mips.h b/libavcodec/mips/blockdsp_mips.h new file mode 100644 index 0000000000..9559d40eaa --- /dev/null +++ b/libavcodec/mips/blockdsp_mips.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H +#define AVCODEC_MIPS_BLOCKDSP_MIPS_H + +#include "../mpegvideo.h" + +void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height); +void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height); +void ff_clear_block_msa(int16_t *block); +void ff_clear_blocks_msa(int16_t *block); + +void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h); +void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h); +void ff_clear_block_mmi(int16_t *block); +void ff_clear_blocks_mmi(int16_t *block); + +#endif // #ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H diff --git a/libavcodec/mips/blockdsp_mmi.c b/libavcodec/mips/blockdsp_mmi.c new file mode 100644 index 0000000000..6eb2bd7d2c --- /dev/null +++ b/libavcodec/mips/blockdsp_mmi.c @@ -0,0 +1,158 @@ +/* + * Loongson SIMD optimized blockdsp + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "blockdsp_mips.h" +#include "libavutil/mips/asmdefs.h" + +void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h) +{ + double ftmp[1]; + + __asm__ volatile ( + "mtc1 %[value], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[block]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[block]) \n\t" + PTR_ADDI "%[h], %[h], -0x01 \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[block]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[block]) \n\t" + PTR_ADDU "%[block], %[block], %[line_size] \n\t" + "bnez %[h], 1b \n\t" + : [block]"+&r"(block), [h]"+&r"(h), + [ftmp0]"=&f"(ftmp[0]) + : [value]"r"(value), [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h) +{ + double ftmp0; + + __asm__ volatile ( + "mtc1 %[value], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[block]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[block]) \n\t" + PTR_ADDI "%[h], %[h], -0x01 \n\t" + PTR_ADDU "%[block], %[block], %[line_size] \n\t" + "bnez %[h], 1b \n\t" + : [block]"+&r"(block), [h]"+&r"(h), + [ftmp0]"=&f"(ftmp0) + : [value]"r"(value), [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_clear_block_mmi(int16_t *block) +{ + double ftmp[2]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x00(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x10(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x20(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x30(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x40(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x50(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x60(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x70(%[block]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]) + : [block]"r"(block) + : "memory" + ); +} + +void ff_clear_blocks_mmi(int16_t *block) +{ + double ftmp[2]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x00(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x10(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x20(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x30(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x40(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x50(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x60(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x70(%[block]) \n\t" + + "gssqc1 %[ftmp0], %[ftmp1], 0x80(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x90(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0xa0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0xb0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0xc0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0xd0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0xe0(%[block]) \n\t" + "gssqc1 %[ftmp0], 
%[ftmp1], 0xf0(%[block]) \n\t" + + "gssqc1 %[ftmp0], %[ftmp1], 0x100(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x110(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x120(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x130(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x140(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x150(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x160(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x170(%[block]) \n\t" + + "gssqc1 %[ftmp0], %[ftmp1], 0x180(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x190(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1a0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1b0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1c0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1d0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1e0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x1f0(%[block]) \n\t" + + "gssqc1 %[ftmp0], %[ftmp1], 0x200(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x210(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x220(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x230(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x240(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x250(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x260(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x270(%[block]) \n\t" + + "gssqc1 %[ftmp0], %[ftmp1], 0x280(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x290(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2a0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2b0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2c0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2d0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2e0(%[block]) \n\t" + "gssqc1 %[ftmp0], %[ftmp1], 0x2f0(%[block]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]) + : [block]"r"((mips_reg)block) + : "memory" + ); +} diff --git a/libavcodec/mips/blockdsp_msa.c b/libavcodec/mips/blockdsp_msa.c new file mode 100644 index 0000000000..32ac858e1d --- /dev/null +++ b/libavcodec/mips/blockdsp_msa.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "blockdsp_mips.h" + +static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val, + int32_t src_stride, int32_t height) +{ + int32_t cnt; + uint64_t dst0; + v16u8 val0; + + val0 = (v16u8) __msa_fill_b(val); + dst0 = __msa_copy_u_d((v2i64) val0, 0); + + for (cnt = (height >> 2); cnt--;) { + SD4(dst0, dst0, dst0, dst0, src, src_stride); + src += (4 * src_stride); + } +} + +static void copy_8bit_value_width16_msa(uint8_t *src, uint8_t val, + int32_t src_stride, int32_t height) +{ + int32_t cnt; + v16u8 val0; + + val0 = (v16u8) __msa_fill_b(val); + + for (cnt = (height >> 3); cnt--;) { + ST_UB8(val0, val0, val0, val0, val0, val0, val0, val0, src, src_stride); + src += (8 * src_stride); + } +} + +static void memset_zero_16width_msa(uint8_t *src, int32_t stride, + int32_t height) +{ + int8_t cnt; + v16u8 zero = { 0 }; + + for (cnt = (height / 2); cnt--;) { + ST_UB(zero, src); + src += stride; + ST_UB(zero, src); + src += stride; + } +} + +void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height) +{ + copy_8bit_value_width16_msa(src, val, stride, height); +} + +void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height) +{ + copy_8bit_value_width8_msa(src, val, stride, height); +} + +void ff_clear_block_msa(int16_t *block) +{ + memset_zero_16width_msa((uint8_t *) block, 16, 8); +} + +void ff_clear_blocks_msa(int16_t *block) +{ + memset_zero_16width_msa((uint8_t *) block, 16, 8 * 6); +} diff --git a/libavcodec/mips/celp_filters_mips.c b/libavcodec/mips/celp_filters_mips.c new file mode 100644 index 0000000000..926f1cb334 --- /dev/null +++ b/libavcodec/mips/celp_filters_mips.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * various filters for CELP-based codecs optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/celp_filters.c + */ +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/common.h" +#include "libavcodec/celp_filters.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void ff_celp_lp_synthesis_filterf_mips(float *out, + const float *filter_coeffs, + const float* in, int buffer_length, + int filter_length) +{ + int i,n; + + float out0, out1, out2, out3; + float old_out0, old_out1, old_out2, old_out3; + float a,b,c; + const float *p_filter_coeffs; + float *p_out; + + a = filter_coeffs[0]; + b = filter_coeffs[1]; + c = filter_coeffs[2]; + b -= filter_coeffs[0] * filter_coeffs[0]; + c -= filter_coeffs[1] * filter_coeffs[0]; + c -= filter_coeffs[0] * b; + + old_out0 = out[-4]; + old_out1 = out[-3]; + old_out2 = out[-2]; + old_out3 = out[-1]; + for (n = 0; n <= buffer_length - 4; n+=4) { + p_filter_coeffs = filter_coeffs; + p_out = out; + + out0 = in[0]; + out1 = in[1]; + out2 = in[2]; + out3 = in[3]; + + __asm__ volatile( + "lwc1 $f2, 8(%[filter_coeffs]) \n\t" + "lwc1 $f1, 4(%[filter_coeffs]) \n\t" + "lwc1 $f0, 0(%[filter_coeffs]) \n\t" + "nmsub.s %[out0], %[out0], $f2, %[old_out1] \n\t" + "nmsub.s %[out1], %[out1], $f2, %[old_out2] \n\t" + "nmsub.s %[out2], %[out2], $f2, %[old_out3] \n\t" + "lwc1 $f3, 12(%[filter_coeffs]) \n\t" + "nmsub.s %[out0], %[out0], $f1, %[old_out2] \n\t" + "nmsub.s %[out1], %[out1], $f1, %[old_out3] \n\t" + "nmsub.s %[out2], %[out2], $f3, %[old_out2] \n\t" + "nmsub.s %[out0], %[out0], $f0, %[old_out3] \n\t" + "nmsub.s %[out3], %[out3], $f3, %[old_out3] \n\t" + "nmsub.s %[out1], %[out1], $f3, %[old_out1] \n\t" + "nmsub.s %[out0], %[out0], $f3, %[old_out0] \n\t" + + : [out0]"+f"(out0), [out1]"+f"(out1), + [out2]"+f"(out2), [out3]"+f"(out3) + : [old_out0]"f"(old_out0), [old_out1]"f"(old_out1), + [old_out2]"f"(old_out2), [old_out3]"f"(old_out3), + [filter_coeffs]"r"(filter_coeffs) + : "$f0", "$f1", "$f2", "$f3", "$f4", "memory" + ); + + for (i = 5; i <= filter_length; i += 2) { + __asm__ volatile( + "lwc1 %[old_out3], -20(%[p_out]) \n\t" + "lwc1 $f5, 16(%[p_filter_coeffs]) \n\t" + PTR_ADDIU "%[p_out], -8 \n\t" + PTR_ADDIU "%[p_filter_coeffs], 8 \n\t" + "nmsub.s %[out1], %[out1], $f5, %[old_out0] \n\t" + "nmsub.s %[out3], %[out3], $f5, %[old_out2] \n\t" + "lwc1 $f4, 12(%[p_filter_coeffs]) \n\t" + "lwc1 %[old_out2], -16(%[p_out]) \n\t" + "nmsub.s %[out0], %[out0], $f5, %[old_out3] \n\t" + "nmsub.s %[out2], %[out2], $f5, %[old_out1] \n\t" + "nmsub.s %[out1], %[out1], $f4, %[old_out3] \n\t" + "nmsub.s %[out3], %[out3], $f4, %[old_out1] \n\t" + "mov.s %[old_out1], %[old_out3] 
\n\t" + "nmsub.s %[out0], %[out0], $f4, %[old_out2] \n\t" + "nmsub.s %[out2], %[out2], $f4, %[old_out0] \n\t" + + : [out0]"+f"(out0), [out1]"+f"(out1), + [out2]"+f"(out2), [out3]"+f"(out3), [old_out0]"+f"(old_out0), + [old_out1]"+f"(old_out1), [old_out2]"+f"(old_out2), + [old_out3]"+f"(old_out3),[p_filter_coeffs]"+r"(p_filter_coeffs), + [p_out]"+r"(p_out) + : + : "$f4", "$f5", "memory" + ); + FFSWAP(float, old_out0, old_out2); + } + + __asm__ volatile( + "nmsub.s %[out3], %[out3], %[a], %[out2] \n\t" + "nmsub.s %[out2], %[out2], %[a], %[out1] \n\t" + "nmsub.s %[out3], %[out3], %[b], %[out1] \n\t" + "nmsub.s %[out1], %[out1], %[a], %[out0] \n\t" + "nmsub.s %[out2], %[out2], %[b], %[out0] \n\t" + "nmsub.s %[out3], %[out3], %[c], %[out0] \n\t" + + : [out0]"+f"(out0), [out1]"+f"(out1), + [out2]"+f"(out2), [out3]"+f"(out3) + : [a]"f"(a), [b]"f"(b), [c]"f"(c) + ); + + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + + old_out0 = out0; + old_out1 = out1; + old_out2 = out2; + old_out3 = out3; + + out += 4; + in += 4; + } + + out -= n; + in -= n; + for (; n < buffer_length; n++) { + float out_val, out_val_i, fc_val; + p_filter_coeffs = filter_coeffs; + p_out = &out[n]; + out_val = in[n]; + for (i = 1; i <= filter_length; i++) { + __asm__ volatile( + "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t" + "lwc1 %[out_val_i], -4(%[p_out]) \n\t" + PTR_ADDIU "%[p_filter_coeffs], 4 \n\t" + PTR_ADDIU "%[p_out], -4 \n\t" + "nmsub.s %[out_val], %[out_val], %[fc_val], %[out_val_i] \n\t" + + : [fc_val]"=&f"(fc_val), [out_val]"+f"(out_val), + [out_val_i]"=&f"(out_val_i), [p_out]"+r"(p_out), + [p_filter_coeffs]"+r"(p_filter_coeffs) + : + : "memory" + ); + } + out[n] = out_val; + } +} + +static void ff_celp_lp_zero_synthesis_filterf_mips(float *out, + const float *filter_coeffs, + const float *in, int buffer_length, + int filter_length) +{ + int i,n; + float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val; + float sum_out3, sum_out2, sum_out1; + const float *p_filter_coeffs, *p_in; + + for (n = 0; n < buffer_length; n+=8) { + p_in = &in[n]; + p_filter_coeffs = filter_coeffs; + sum_out8 = in[n+7]; + sum_out7 = in[n+6]; + sum_out6 = in[n+5]; + sum_out5 = in[n+4]; + sum_out4 = in[n+3]; + sum_out3 = in[n+2]; + sum_out2 = in[n+1]; + sum_out1 = in[n]; + i = filter_length; + + /* i is always greater than 0 + * outer loop is unrolled eight times so there is less memory access + * inner loop is unrolled two times + */ + __asm__ volatile( + "filt_lp_inner%=: \n\t" + "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t" + "lwc1 $f7, 6*4(%[p_in]) \n\t" + "lwc1 $f6, 5*4(%[p_in]) \n\t" + "lwc1 $f5, 4*4(%[p_in]) \n\t" + "lwc1 $f4, 3*4(%[p_in]) \n\t" + "lwc1 $f3, 2*4(%[p_in]) \n\t" + "lwc1 $f2, 4(%[p_in]) \n\t" + "lwc1 $f1, 0(%[p_in]) \n\t" + "lwc1 $f0, -4(%[p_in]) \n\t" + "addiu %[i], -2 \n\t" + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t" + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t" + "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t" + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t" + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t" + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t" + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t" + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t" + "lwc1 %[fc_val], 4(%[p_filter_coeffs]) \n\t" + "lwc1 $f7, -8(%[p_in]) \n\t" + PTR_ADDIU "%[p_filter_coeffs], 8 \n\t" + PTR_ADDIU "%[p_in], -8 \n\t" + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f6 \n\t" + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f5 \n\t" + "madd.s 
%[sum_out6], %[sum_out6], %[fc_val], $f4 \n\t" + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f3 \n\t" + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f2 \n\t" + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f1 \n\t" + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f0 \n\t" + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f7 \n\t" + "bgtz %[i], filt_lp_inner%= \n\t" + + : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7), + [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5), + [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3), + [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1), + [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs), + [p_in]"+r"(p_in), [i]"+r"(i) + : + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "memory" + ); + + out[n+7] = sum_out8; + out[n+6] = sum_out7; + out[n+5] = sum_out6; + out[n+4] = sum_out5; + out[n+3] = sum_out4; + out[n+2] = sum_out3; + out[n+1] = sum_out2; + out[n] = sum_out1; + } +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_celp_filter_init_mips(CELPFContext *c) +{ +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->celp_lp_synthesis_filterf = ff_celp_lp_synthesis_filterf_mips; + c->celp_lp_zero_synthesis_filterf = ff_celp_lp_zero_synthesis_filterf_mips; +#endif +#endif +} diff --git a/libavcodec/mips/celp_math_mips.c b/libavcodec/mips/celp_math_mips.c new file mode 100644 index 0000000000..ce711bd63c --- /dev/null +++ b/libavcodec/mips/celp_math_mips.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * Math operations optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/celp_math.c + */ +#include "config.h" +#include "libavcodec/celp_math.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static float ff_dot_productf_mips(const float* a, const float* b, + int length) +{ + float sum; + const float* a_end = a + length; + + __asm__ volatile ( + "mtc1 $zero, %[sum] \n\t" + "blez %[length], ff_dot_productf_end%= \n\t" + "ff_dot_productf_madd%=: \n\t" + "lwc1 $f2, 0(%[a]) \n\t" + "lwc1 $f1, 0(%[b]) \n\t" + PTR_ADDIU "%[a], %[a], 4 \n\t" + PTR_ADDIU "%[b], %[b], 4 \n\t" + "madd.s %[sum], %[sum], $f1, $f2 \n\t" + "bne %[a], %[a_end], ff_dot_productf_madd%= \n\t" + "ff_dot_productf_end%=: \n\t" + + : [sum] "=&f" (sum), [a] "+r" (a), [b] "+r" (b) + : [a_end]"r"(a_end), [length] "r" (length) + : "$f1", "$f2", "memory" + ); + return sum; +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_celp_math_init_mips(CELPMContext *c) +{ +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + c->dot_productf = ff_dot_productf_mips; +#endif +#endif +} diff --git a/libavcodec/mips/compute_antialias_fixed.h b/libavcodec/mips/compute_antialias_fixed.h new file mode 100644 index 0000000000..a967f67de7 --- /dev/null +++ b/libavcodec/mips/compute_antialias_fixed.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * Compute antialias function optimised for MIPS fixed-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/mpegaudiodec.c + */ + +#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H +#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H + +#if HAVE_INLINE_ASM +static void compute_antialias_mips_fixed(MPADecodeContext *s, + GranuleDef *g) +{ + int32_t *ptr, *csa; + int n, i; + int MAX_lo = 0xffffffff; + + /* we antialias only "long" bands */ + if (g->block_type == 2) { + if (!g->switch_point) + return; + /* XXX: check this for 8000Hz case */ + n = 1; + } else { + n = SBLIMIT - 1; + } + + + ptr = g->sb_hybrid + 18; + + for(i = n;i > 0;i--) { + int tmp0, tmp1, tmp2, tmp00, tmp11; + int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6; + csa = &csa_table[0][0]; + + /** + * instructions are scheduled to minimize pipeline stall. + */ + __asm__ volatile ( + "lw %[tmp0], -1*4(%[ptr]) \n\t" + "lw %[tmp1], 0*4(%[ptr]) \n\t" + "lw %[temp_reg1], 0*4(%[csa]) \n\t" + "lw %[temp_reg2], 2*4(%[csa]) \n\t" + "add %[tmp2], %[tmp0], %[tmp1] \n\t" + "lw %[temp_reg3], 3*4(%[csa]) \n\t" + "mult $ac0, %[tmp2], %[temp_reg1] \n\t" + "mult $ac1, %[tmp2], %[temp_reg1] \n\t" + "lw %[tmp00], -2*4(%[ptr]) \n\t" + "lw %[tmp11], 1*4(%[ptr]) \n\t" + "lw %[temp_reg4], 4*4(%[csa]) \n\t" + "mtlo %[MAX_lo], $ac0 \n\t" + "mtlo $zero, $ac1 \n\t" + "msub $ac0, %[tmp1], %[temp_reg2] \n\t" + "madd $ac1, %[tmp0], %[temp_reg3] \n\t" + "add %[tmp2], %[tmp00], %[tmp11] \n\t" + "lw %[temp_reg5], 6*4(%[csa]) \n\t" + "mult $ac2, %[tmp2], %[temp_reg4] \n\t" + "mult $ac3, %[tmp2], %[temp_reg4] \n\t" + "mfhi %[temp_reg1], $ac0 \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "lw %[temp_reg6], 7*4(%[csa]) \n\t" + "mtlo %[MAX_lo], $ac2 \n\t" + "msub $ac2, %[tmp11], %[temp_reg5] \n\t" + "mtlo $zero, $ac3 \n\t" + "madd $ac3, %[tmp00], %[temp_reg6] \n\t" + "sll %[temp_reg1], %[temp_reg1], 2 \n\t" + "sw %[temp_reg1], -1*4(%[ptr]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "sll %[temp_reg2], %[temp_reg2], 2 \n\t" + "mfhi %[temp_reg5], $ac3 \n\t" + "sw %[temp_reg2], 0*4(%[ptr]) \n\t" + "lw %[tmp0], -3*4(%[ptr]) \n\t" + "lw %[tmp1], 2*4(%[ptr]) \n\t" + "lw %[temp_reg1], 8*4(%[csa]) \n\t" + "sll %[temp_reg4], %[temp_reg4], 2 \n\t" + "add %[tmp2], %[tmp0], %[tmp1] \n\t" + "sll %[temp_reg5], %[temp_reg5], 2 \n\t" + "mult $ac0, %[tmp2], %[temp_reg1] \n\t" + "mult $ac1, %[tmp2], %[temp_reg1] \n\t" + "sw %[temp_reg4], -2*4(%[ptr]) \n\t" + "sw %[temp_reg5], 1*4(%[ptr]) \n\t" + "lw %[temp_reg2], 10*4(%[csa]) \n\t" + "mtlo %[MAX_lo], $ac0 \n\t" + "lw %[temp_reg3], 11*4(%[csa]) \n\t" + "msub $ac0, %[tmp1], %[temp_reg2] \n\t" + "mtlo $zero, $ac1 \n\t" + "madd $ac1, %[tmp0], %[temp_reg3] \n\t" + "lw %[tmp00], -4*4(%[ptr]) \n\t" + "lw 
%[tmp11], 3*4(%[ptr]) \n\t" + "mfhi %[temp_reg1], $ac0 \n\t" + "lw %[temp_reg4], 12*4(%[csa]) \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "add %[tmp2], %[tmp00], %[tmp11] \n\t" + "mult $ac2, %[tmp2], %[temp_reg4] \n\t" + "mult $ac3, %[tmp2], %[temp_reg4] \n\t" + "lw %[temp_reg5], 14*4(%[csa]) \n\t" + "lw %[temp_reg6], 15*4(%[csa]) \n\t" + "sll %[temp_reg1], %[temp_reg1], 2 \n\t" + "mtlo %[MAX_lo], $ac2 \n\t" + "msub $ac2, %[tmp11], %[temp_reg5] \n\t" + "mtlo $zero, $ac3 \n\t" + "madd $ac3, %[tmp00], %[temp_reg6] \n\t" + "sll %[temp_reg2], %[temp_reg2], 2 \n\t" + "sw %[temp_reg1], -3*4(%[ptr]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "sw %[temp_reg2], 2*4(%[ptr]) \n\t" + "mfhi %[temp_reg5], $ac3 \n\t" + "lw %[tmp0], -5*4(%[ptr]) \n\t" + "lw %[tmp1], 4*4(%[ptr]) \n\t" + "lw %[temp_reg1], 16*4(%[csa]) \n\t" + "lw %[temp_reg2], 18*4(%[csa]) \n\t" + "add %[tmp2], %[tmp0], %[tmp1] \n\t" + "lw %[temp_reg3], 19*4(%[csa]) \n\t" + "mult $ac0, %[tmp2], %[temp_reg1] \n\t" + "mult $ac1, %[tmp2], %[temp_reg1] \n\t" + "sll %[temp_reg4], %[temp_reg4], 2 \n\t" + "sll %[temp_reg5], %[temp_reg5], 2 \n\t" + "sw %[temp_reg4], -4*4(%[ptr]) \n\t" + "mtlo %[MAX_lo], $ac0 \n\t" + "msub $ac0, %[tmp1], %[temp_reg2] \n\t" + "mtlo $zero, $ac1 \n\t" + "madd $ac1, %[tmp0], %[temp_reg3] \n\t" + "sw %[temp_reg5], 3*4(%[ptr]) \n\t" + "lw %[tmp00], -6*4(%[ptr]) \n\t" + "mfhi %[temp_reg1], $ac0 \n\t" + "lw %[tmp11], 5*4(%[ptr]) \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "lw %[temp_reg4], 20*4(%[csa]) \n\t" + "add %[tmp2], %[tmp00], %[tmp11] \n\t" + "lw %[temp_reg5], 22*4(%[csa]) \n\t" + "mult $ac2, %[tmp2], %[temp_reg4] \n\t" + "mult $ac3, %[tmp2], %[temp_reg4] \n\t" + "lw %[temp_reg6], 23*4(%[csa]) \n\t" + "sll %[temp_reg1], %[temp_reg1], 2 \n\t" + "sll %[temp_reg2], %[temp_reg2], 2 \n\t" + "mtlo %[MAX_lo], $ac2 \n\t" + "msub $ac2, %[tmp11], %[temp_reg5] \n\t" + "mtlo $zero, $ac3 \n\t" + "madd $ac3, %[tmp00], %[temp_reg6] \n\t" + "sw %[temp_reg1], -5*4(%[ptr]) \n\t" + "sw %[temp_reg2], 4*4(%[ptr]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "lw %[tmp0], -7*4(%[ptr]) \n\t" + "mfhi %[temp_reg5], $ac3 \n\t" + "lw %[tmp1], 6*4(%[ptr]) \n\t" + "lw %[temp_reg1], 24*4(%[csa]) \n\t" + "lw %[temp_reg2], 26*4(%[csa]) \n\t" + "add %[tmp2], %[tmp0], %[tmp1] \n\t" + "lw %[temp_reg3], 27*4(%[csa]) \n\t" + "mult $ac0, %[tmp2], %[temp_reg1] \n\t" + "mult $ac1, %[tmp2], %[temp_reg1] \n\t" + "sll %[temp_reg4], %[temp_reg4], 2 \n\t" + "sll %[temp_reg5], %[temp_reg5], 2 \n\t" + "sw %[temp_reg4], -6*4(%[ptr]) \n\t" + "mtlo %[MAX_lo], $ac0 \n\t" + "msub $ac0, %[tmp1], %[temp_reg2] \n\t" + "mtlo $zero, $ac1 \n\t" + "madd $ac1, %[tmp0], %[temp_reg3] \n\t" + "sw %[temp_reg5], 5*4(%[ptr]) \n\t" + "lw %[tmp00], -8*4(%[ptr]) \n\t" + "mfhi %[temp_reg1], $ac0 \n\t" + "lw %[tmp11], 7*4(%[ptr]) \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "lw %[temp_reg4], 28*4(%[csa]) \n\t" + "add %[tmp2], %[tmp00], %[tmp11] \n\t" + "lw %[temp_reg5], 30*4(%[csa]) \n\t" + "mult $ac2, %[tmp2], %[temp_reg4] \n\t" + "mult $ac3, %[tmp2], %[temp_reg4] \n\t" + "lw %[temp_reg6], 31*4(%[csa]) \n\t" + "sll %[temp_reg1], %[temp_reg1], 2 \n\t" + "sll %[temp_reg2], %[temp_reg2], 2 \n\t" + "mtlo %[MAX_lo], $ac2 \n\t" + "msub $ac2, %[tmp11], %[temp_reg5] \n\t" + "mtlo $zero, $ac3 \n\t" + "madd $ac3, %[tmp00], %[temp_reg6] \n\t" + "sw %[temp_reg1], -7*4(%[ptr]) \n\t" + "sw %[temp_reg2], 6*4(%[ptr]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "mfhi %[temp_reg5], $ac3 \n\t" + "sll %[temp_reg4], %[temp_reg4], 2 \n\t" + "sll %[temp_reg5], %[temp_reg5], 2 \n\t" + "sw %[temp_reg4], -8*4(%[ptr]) \n\t" + "sw 
%[temp_reg5], 7*4(%[ptr]) \n\t" + + : [tmp0] "=&r" (tmp0), [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp00] "=&r" (tmp00), [tmp11] "=&r" (tmp11), + [temp_reg1] "=&r" (temp_reg1), [temp_reg2] "=&r" (temp_reg2), + [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4), + [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6) + : [csa] "r" (csa), [ptr] "r" (ptr), + [MAX_lo] "r" (MAX_lo) + : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", + "$ac3hi", "$ac3lo" + ); + + ptr += 18; + } +} +#define compute_antialias compute_antialias_mips_fixed +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H */ diff --git a/libavcodec/mips/compute_antialias_float.h b/libavcodec/mips/compute_antialias_float.h new file mode 100644 index 0000000000..e2b4f29f4a --- /dev/null +++ b/libavcodec/mips/compute_antialias_float.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * Compute antialias function optimised for MIPS floating-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/mpegaudiodec.c + */ + +#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H +#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H + +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void compute_antialias_mips_float(MPADecodeContext *s, + GranuleDef *g) +{ + float *ptr, *ptr_end; + float *csa = &csa_table[0][0]; + /* temporary variables */ + float in1, in2, in3, in4, in5, in6, in7, in8; + float out1, out2, out3, out4; + + ptr = g->sb_hybrid + 18; + /* we antialias only "long" bands */ + if (g->block_type == 2) { + if (!g->switch_point) + return; + /* XXX: check this for 8000Hz case */ + ptr_end = ptr + 18; + } else { + ptr_end = ptr + 558; + } + + /** + * instructions are scheduled to minimize pipeline stall. + */ + + __asm__ volatile ( + "compute_antialias_float_loop%=: \t\n" + "lwc1 %[in1], -1*4(%[ptr]) \t\n" + "lwc1 %[in2], 0(%[csa]) \t\n" + "lwc1 %[in3], 1*4(%[csa]) \t\n" + "lwc1 %[in4], 0(%[ptr]) \t\n" + "lwc1 %[in5], -2*4(%[ptr]) \t\n" + "lwc1 %[in6], 4*4(%[csa]) \t\n" + "mul.s %[out1], %[in1], %[in2] \t\n" + "mul.s %[out2], %[in1], %[in3] \t\n" + "lwc1 %[in7], 5*4(%[csa]) \t\n" + "lwc1 %[in8], 1*4(%[ptr]) \t\n" + "nmsub.s %[out1], %[out1], %[in3], %[in4] \t\n" + "madd.s %[out2], %[out2], %[in2], %[in4] \t\n" + "mul.s %[out3], %[in5], %[in6] \t\n" + "mul.s %[out4], %[in5], %[in7] \t\n" + "lwc1 %[in1], -3*4(%[ptr]) \t\n" + "swc1 %[out1], -1*4(%[ptr]) \t\n" + "swc1 %[out2], 0(%[ptr]) \t\n" + "nmsub.s %[out3], %[out3], %[in7], %[in8] \t\n" + "madd.s %[out4], %[out4], %[in6], %[in8] \t\n" + "lwc1 %[in2], 8*4(%[csa]) \t\n" + "swc1 %[out3], -2*4(%[ptr]) \t\n" + "swc1 %[out4], 1*4(%[ptr]) \t\n" + "lwc1 %[in3], 9*4(%[csa]) \t\n" + "lwc1 %[in4], 2*4(%[ptr]) \t\n" + "mul.s %[out1], %[in1], %[in2] \t\n" + "lwc1 %[in5], -4*4(%[ptr]) \t\n" + "lwc1 %[in6], 12*4(%[csa]) \t\n" + "mul.s %[out2], %[in1], %[in3] \t\n" + "lwc1 %[in7], 13*4(%[csa]) \t\n" + "nmsub.s %[out1], %[out1], %[in3], %[in4] \t\n" + "lwc1 %[in8], 3*4(%[ptr]) \t\n" + "mul.s %[out3], %[in5], %[in6] \t\n" + "madd.s %[out2], %[out2], %[in2], %[in4] \t\n" + "mul.s %[out4], %[in5], %[in7] \t\n" + "swc1 %[out1], -3*4(%[ptr]) \t\n" + "lwc1 %[in1], -5*4(%[ptr]) \t\n" + "nmsub.s %[out3], %[out3], %[in7], %[in8] \t\n" + "swc1 %[out2], 2*4(%[ptr]) \t\n" + "madd.s %[out4], %[out4], %[in6], %[in8] \t\n" + "lwc1 %[in2], 16*4(%[csa]) \t\n" + "lwc1 %[in3], 17*4(%[csa]) \t\n" + "swc1 %[out3], -4*4(%[ptr]) \t\n" + "lwc1 %[in4], 4*4(%[ptr]) \t\n" + "swc1 %[out4], 3*4(%[ptr]) \t\n" + "mul.s %[out1], %[in1], %[in2] \t\n" + "mul.s %[out2], %[in1], %[in3] \t\n" + "lwc1 %[in5], -6*4(%[ptr]) \t\n" + "lwc1 %[in6], 20*4(%[csa]) \t\n" + "lwc1 %[in7], 21*4(%[csa]) \t\n" + "nmsub.s %[out1], %[out1], %[in3], %[in4] \t\n" + "madd.s %[out2], %[out2], %[in2], %[in4] \t\n" + "lwc1 %[in8], 5*4(%[ptr]) \t\n" + "mul.s %[out3], %[in5], %[in6] \t\n" + "mul.s %[out4], %[in5], %[in7] \t\n" + "swc1 %[out1], -5*4(%[ptr]) \t\n" + "swc1 %[out2], 4*4(%[ptr]) \t\n" + "lwc1 %[in1], -7*4(%[ptr]) \t\n" + "nmsub.s %[out3], %[out3], %[in7], %[in8] \t\n" + "madd.s %[out4], %[out4], %[in6], %[in8] \t\n" + "lwc1 %[in2], 24*4(%[csa]) \t\n" + "lwc1 %[in3], 25*4(%[csa]) \t\n" + "lwc1 %[in4], 6*4(%[ptr]) \t\n" + "swc1 %[out3], -6*4(%[ptr]) \t\n" + "swc1 %[out4], 5*4(%[ptr]) \t\n" + 
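/* Each unrolled pair of nmsub.s/madd.s + swc1 above is one MP3 antialias
 * butterfly. A scalar sketch (following the reference C code in
 * libavcodec/mpegaudiodec.c, with j = 0..7 and csa = &csa_table[0][0];
 * not part of the patch itself):
 *
 *   tmp0 = ptr[-1 - j];
 *   tmp1 = ptr[     j];
 *   ptr[-1 - j] = tmp0 * csa[4*j]     - tmp1 * csa[4*j + 1];
 *   ptr[     j] = tmp0 * csa[4*j + 1] + tmp1 * csa[4*j];
 */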
"mul.s %[out1], %[in1], %[in2] \t\n" + "lwc1 %[in5], -8*4(%[ptr]) \t\n" + "mul.s %[out2], %[in1], %[in3] \t\n" + "lwc1 %[in6], 28*4(%[csa]) \t\n" + "lwc1 %[in7], 29*4(%[csa]) \t\n" + "nmsub.s %[out1], %[out1], %[in3], %[in4] \t\n" + "lwc1 %[in8], 7*4(%[ptr]) \t\n" + "madd.s %[out2], %[out2], %[in2], %[in4] \t\n" + "mul.s %[out3], %[in5], %[in6] \t\n" + "mul.s %[out4], %[in5], %[in7] \t\n" + "swc1 %[out1], -7*4(%[ptr]) \t\n" + "swc1 %[out2], 6*4(%[ptr]) \t\n" + PTR_ADDIU "%[ptr],%[ptr], 72 \t\n" + "nmsub.s %[out3], %[out3], %[in7], %[in8] \t\n" + "madd.s %[out4], %[out4], %[in6], %[in8] \t\n" + "swc1 %[out3], -26*4(%[ptr]) \t\n" + "swc1 %[out4], -11*4(%[ptr]) \t\n" + "bne %[ptr], %[ptr_end], compute_antialias_float_loop%= \t\n" + + : [ptr] "+r" (ptr), + [in1] "=&f" (in1), [in2] "=&f" (in2), + [in3] "=&f" (in3), [in4] "=&f" (in4), + [in5] "=&f" (in5), [in6] "=&f" (in6), + [in7] "=&f" (in7), [in8] "=&f" (in8), + [out1] "=&f" (out1), [out2] "=&f" (out2), + [out3] "=&f" (out3), [out4] "=&f" (out4) + : [csa] "r" (csa), [ptr_end] "r" (ptr_end) + : "memory" + ); +} +#define compute_antialias compute_antialias_mips_float +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H */ diff --git a/libavcodec/mips/constants.c b/libavcodec/mips/constants.c new file mode 100644 index 0000000000..3503fad37b --- /dev/null +++ b/libavcodec/mips/constants.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/mem.h" +#include "constants.h" + +DECLARE_ALIGNED(8, const uint64_t, ff_pw_1) = {0x0001000100010001ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_2) = {0x0002000200020002ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_3) = {0x0003000300030003ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_4) = {0x0004000400040004ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_5) = {0x0005000500050005ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_8) = {0x0008000800080008ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_9) = {0x0009000900090009ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_10) = {0x000A000A000A000AULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_16) = {0x0010001000100010ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_18) = {0x0012001200120012ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = {0x0014001400140014ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_28) = {0x001C001C001C001CULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_32) = {0x0020002000200020ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = {0x0035003500350035ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_64) = {0x0040004000400040ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = {0x0080008000800080ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_512) = {0x0200020002000200ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_m8tom5) = {0xFFFBFFFAFFF9FFF8ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_m4tom1) = {0xFFFFFFFEFFFDFFFCULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_1to4) = {0x0004000300020001ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_5to8) = {0x0008000700060005ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_0to3) = {0x0003000200010000ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_4to7) = {0x0007000600050004ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_8tob) = {0x000b000a00090008ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_ctof) = {0x000f000e000d000cULL}; + +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1) = {0x0101010101010101ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3) = {0x0303030303030303ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_80) = {0x8080808080808080ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1) = {0xA1A1A1A1A1A1A1A1ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FE) = {0xFEFEFEFEFEFEFEFEULL}; + +DECLARE_ALIGNED(8, const uint64_t, ff_rnd) = {0x0004000400040004ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_rnd2) = {0x0040004000400040ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_rnd3) = {0x0020002000200020ULL}; + +DECLARE_ALIGNED(8, const uint64_t, ff_wm1010) = {0xFFFF0000FFFF0000ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_d40000) = {0x0000000000040000ULL}; diff --git a/libavcodec/mips/constants.h b/libavcodec/mips/constants.h new file mode 100644 index 0000000000..19d2d73c29 --- /dev/null +++ b/libavcodec/mips/constants.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_CONSTANTS_H +#define AVCODEC_MIPS_CONSTANTS_H + +#include <stdint.h> + +extern const uint64_t ff_pw_1; +extern const uint64_t ff_pw_2; +extern const uint64_t ff_pw_3; +extern const uint64_t ff_pw_4; +extern const uint64_t ff_pw_5; +extern const uint64_t ff_pw_8; +extern const uint64_t ff_pw_9; +extern const uint64_t ff_pw_10; +extern const uint64_t ff_pw_16; +extern const uint64_t ff_pw_18; +extern const uint64_t ff_pw_20; +extern const uint64_t ff_pw_28; +extern const uint64_t ff_pw_32; +extern const uint64_t ff_pw_53; +extern const uint64_t ff_pw_64; +extern const uint64_t ff_pw_128; +extern const uint64_t ff_pw_512; +extern const uint64_t ff_pw_m8tom5; +extern const uint64_t ff_pw_m4tom1; +extern const uint64_t ff_pw_1to4; +extern const uint64_t ff_pw_5to8; +extern const uint64_t ff_pw_0to3; +extern const uint64_t ff_pw_4to7; +extern const uint64_t ff_pw_8tob; +extern const uint64_t ff_pw_ctof; + +extern const uint64_t ff_pb_1; +extern const uint64_t ff_pb_3; +extern const uint64_t ff_pb_80; +extern const uint64_t ff_pb_A1; +extern const uint64_t ff_pb_FE; + +extern const uint64_t ff_rnd; +extern const uint64_t ff_rnd2; +extern const uint64_t ff_rnd3; + +extern const uint64_t ff_wm1010; +extern const uint64_t ff_d40000; + +#endif /* AVCODEC_MIPS_CONSTANTS_H */ diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c new file mode 100644 index 0000000000..03dcbad4d8 --- /dev/null +++ b/libavcodec/mips/fft_mips.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Stanislav Ocovaj (socovaj@mips.com) + * Author: Zoran Lukic (zoranl@mips.com) + * + * Optimized MDCT/IMDCT and FFT transforms + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "config.h" +#include "libavcodec/fft.h" +#include "libavcodec/fft_table.h" +#include "libavutil/mips/asmdefs.h" + +/** + * FFT transform + */ + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) +{ + int nbits, i, n, num_transforms, offset, step; + int n4, n2, n34; + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + FFTComplex *tmpz; + float w_re, w_im; + float *w_re_ptr, *w_im_ptr; + const int fft_size = (1 << s->nbits); + float pom, pom1, pom2, pom3; + float temp, temp1, temp3, temp4; + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4; + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i; + + num_transforms = (21845 >> (17 - s->nbits)) | 1; + + for (n=0; n<num_transforms; n++) { + offset = ff_fft_offsets_lut[n] << 2; + tmpz = z + offset; + + tmp1 = tmpz[0].re + tmpz[1].re; + tmp5 = tmpz[2].re + tmpz[3].re; + tmp2 = tmpz[0].im + tmpz[1].im; + tmp6 = tmpz[2].im + tmpz[3].im; + tmp3 = tmpz[0].re - tmpz[1].re; + tmp8 = tmpz[2].im - tmpz[3].im; + tmp4 = tmpz[0].im - tmpz[1].im; + tmp7 = tmpz[2].re - tmpz[3].re; + + tmpz[0].re = tmp1 + tmp5; + tmpz[2].re = tmp1 - tmp5; + tmpz[0].im = tmp2 + tmp6; + tmpz[2].im = tmp2 - tmp6; + tmpz[1].re = tmp3 + tmp8; + tmpz[3].re = tmp3 - tmp8; + tmpz[1].im = tmp4 - tmp7; + tmpz[3].im = tmp4 + tmp7; + + } + + if (fft_size < 8) + return; + + num_transforms = (num_transforms >> 1) | 1; + + for (n=0; n<num_transforms; n++) { + offset = ff_fft_offsets_lut[n] << 3; + tmpz = z + offset; + + __asm__ volatile ( + "lwc1 %[tmp1], 32(%[tmpz]) \n\t" + "lwc1 %[pom], 40(%[tmpz]) \n\t" + "lwc1 %[tmp3], 48(%[tmpz]) \n\t" + "lwc1 %[pom1], 56(%[tmpz]) \n\t" + "lwc1 %[tmp2], 36(%[tmpz]) \n\t" + "lwc1 %[pom2], 44(%[tmpz]) \n\t" + "lwc1 %[pom3], 60(%[tmpz]) \n\t" + "lwc1 %[tmp4], 52(%[tmpz]) \n\t" + "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re; + "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re; + "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im; + "lwc1 %[pom], 40(%[tmpz]) \n\t" + "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im; + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3; + "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3; + "lwc1 %[tmp1], 32(%[tmpz]) \n\t" + "lwc1 %[pom1], 44(%[tmpz]) \n\t" + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4; + "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4; + "lwc1 %[tmp2], 36(%[tmpz]) \n\t" + "lwc1 %[pom2], 56(%[tmpz]) \n\t" + "lwc1 %[pom3], 60(%[tmpz]) \n\t" + "lwc1 %[tmp3], 48(%[tmpz]) \n\t" + 
"lwc1 %[tmp4], 52(%[tmpz]) \n\t" + "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re; + "lwc1 %[pom], 0(%[tmpz]) \n\t" + "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im; + "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re; + "lwc1 %[pom2], 4(%[tmpz]) \n\t" + "sub.s %[pom1], %[pom], %[tmp5] \n\t" + "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im; + "add.s %[pom3], %[pom], %[tmp5] \n\t" + "sub.s %[pom], %[pom2], %[tmp6] \n\t" + "add.s %[pom2], %[pom2], %[tmp6] \n\t" + "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5; + "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5; + "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6; + "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6; + "lwc1 %[pom1], 16(%[tmpz]) \n\t" + "lwc1 %[pom3], 20(%[tmpz]) \n\t" + "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f; + "add.s %[temp1],%[tmp1], %[tmp2] \n\t" + "sub.s %[temp], %[pom1], %[tmp8] \n\t" + "add.s %[pom2], %[pom3], %[tmp7] \n\t" + "sub.s %[temp3],%[tmp3], %[tmp4] \n\t" + "sub.s %[temp4],%[tmp2], %[tmp1] \n\t" + "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8; + "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7; + "add.s %[pom1], %[pom1], %[tmp8] \n\t" + "sub.s %[pom3], %[pom3], %[tmp7] \n\t" + "add.s %[tmp3], %[tmp3], %[tmp4] \n\t" + "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2); + "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4); + "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1); + "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4); + "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8; + "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7; + "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7; + "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7; + "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8; + "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8; + "lwc1 %[temp], 8(%[tmpz]) \n\t" + "lwc1 %[temp1],12(%[tmpz]) \n\t" + "lwc1 %[pom], 24(%[tmpz]) \n\t" + "lwc1 %[pom2], 28(%[tmpz]) \n\t" + "sub.s %[temp4],%[temp], %[tmp1] \n\t" + "sub.s %[temp3],%[temp1], %[tmp2] \n\t" + "add.s %[temp], %[temp], %[tmp1] \n\t" + "add.s %[temp1],%[temp1], %[tmp2] \n\t" + "sub.s %[pom1], %[pom], %[tmp4] \n\t" + "add.s %[pom3], %[pom2], %[tmp3] \n\t" + "add.s %[pom], %[pom], %[tmp4] \n\t" + "sub.s %[pom2], %[pom2], %[tmp3] \n\t" + "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1; + "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2; + "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1; + "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2; + "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4; + "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3; + "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4; + "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3; + : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), + [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7), + [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4) + : [tmpz]"r"(tmpz) + : "memory" + ); + } + + step = 1 << (MAX_LOG2_NFFT - 4); + n4 = 4; 
+ + for (nbits=4; nbits<=s->nbits; nbits++) { + num_transforms = (num_transforms >> 1) | 1; + n2 = 2 * n4; + n34 = 3 * n4; + + for (n=0; n<num_transforms; n++) { + offset = ff_fft_offsets_lut[n] << nbits; + tmpz = z + offset; + + tmpz_n2 = tmpz + n2; + tmpz_n4 = tmpz + n4; + tmpz_n34 = tmpz + n34; + + __asm__ volatile ( + "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t" + "lwc1 %[pom], 0(%[tmpz_n34]) \n\t" + "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t" + "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t" + "lwc1 %[temp1],0(%[tmpz]) \n\t" + "lwc1 %[temp3],4(%[tmpz]) \n\t" + "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re; + "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re; + "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im; + "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im; + "sub.s %[temp], %[temp1], %[tmp5] \n\t" + "add.s %[temp1],%[temp1], %[tmp5] \n\t" + "sub.s %[temp4],%[temp3], %[tmp6] \n\t" + "add.s %[temp3],%[temp3], %[tmp6] \n\t" + "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5; + "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5; + "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t" + "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6; + "lwc1 %[temp], 4(%[tmpz_n4]) \n\t" + "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6; + "sub.s %[pom], %[pom1], %[tmp2] \n\t" + "add.s %[pom1], %[pom1], %[tmp2] \n\t" + "add.s %[temp1],%[temp], %[tmp1] \n\t" + "sub.s %[temp], %[temp], %[tmp1] \n\t" + "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2; + "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2; + "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1; + "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1; + : [tmp5]"=&f"(tmp5), + [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), + [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3), + [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4) + : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4) + : "memory" + ); + + w_re_ptr = (float*)(ff_cos_131072 + step); + w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step); + + for (i=1; i<n4; i++) { + w_re = w_re_ptr[0]; + w_im = w_im_ptr[0]; + tmpz_n2_i = tmpz_n2 + i; + tmpz_n4_i = tmpz_n4 + i; + tmpz_n34_i= tmpz_n34 + i; + tmpz_i = tmpz + i; + + __asm__ volatile ( + "lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t" + "lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t" + "lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t" + "lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t" + "mul.s %[temp3], %[w_im], %[temp] \n\t" + "mul.s %[temp4], %[w_im], %[temp1] \n\t" + "mul.s %[pom2], %[w_im], %[pom1] \n\t" + "mul.s %[pom3], %[w_im], %[pom] \n\t" + "msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re; + "madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im; + "msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im; + "madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re; + "lwc1 %[temp], 0(%[tmpz_i]) \n\t" + "lwc1 %[pom], 4(%[tmpz_i]) \n\t" + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3; + "sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3; + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4; + "sub.s %[tmp2], 
%[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4; + "sub.s %[temp1], %[temp], %[tmp5] \n\t" + "add.s %[temp], %[temp], %[tmp5] \n\t" + "sub.s %[pom1], %[pom], %[tmp6] \n\t" + "add.s %[pom], %[pom], %[tmp6] \n\t" + "lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" + "lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" + "swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5; + "swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5; + "swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6; + "swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6; + "sub.s %[temp4], %[temp3], %[tmp2] \n\t" + "add.s %[pom3], %[pom2], %[tmp1] \n\t" + "add.s %[temp3], %[temp3], %[tmp2] \n\t" + "sub.s %[pom2], %[pom2], %[tmp1] \n\t" + "swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2; + "swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1; + "swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2; + "swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1; + : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3), + [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6), + [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), + [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3) + : [w_re]"f"(w_re), [w_im]"f"(w_im), + [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i), + [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i) + : "memory" + ); + w_re_ptr += step; + w_im_ptr -= step; + } + } + step >>= 1; + n4 <<= 1; + } +} + +/** + * MDCT/IMDCT transforms. + */ + +static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int k, n8, n4, n2, n, j; + const uint16_t *revtab = s->revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2, *in3, *in4; + FFTComplex *z = (FFTComplex *)output; + + int j1; + const float *tcos1, *tsin1, *tcos2, *tsin2; + float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, + temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; + FFTComplex *z1, *z2; + + n = 1 << s->mdct_bits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; + in3 = input + 2; + in4 = input + n2 - 3; + + tcos1 = tcos; + tsin1 = tsin; + + /* n4 = 64 or 128 */ + for(k = 0; k < n4; k += 2) { + j = revtab[k ]; + j1 = revtab[k + 1]; + + __asm__ volatile ( + "lwc1 %[temp1], 0(%[in2]) \t\n" + "lwc1 %[temp2], 0(%[tcos1]) \t\n" + "lwc1 %[temp3], 0(%[tsin1]) \t\n" + "lwc1 %[temp4], 0(%[in1]) \t\n" + "lwc1 %[temp5], 0(%[in4]) \t\n" + "mul.s %[temp9], %[temp1], %[temp2] \t\n" + "mul.s %[temp10], %[temp1], %[temp3] \t\n" + "lwc1 %[temp6], 4(%[tcos1]) \t\n" + "lwc1 %[temp7], 4(%[tsin1]) \t\n" + "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n" + "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n" + "mul.s %[temp11], %[temp5], %[temp6] \t\n" + "mul.s %[temp12], %[temp5], %[temp7] \t\n" + "lwc1 %[temp8], 0(%[in3]) \t\n" + PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n" + PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n" + PTR_ADDIU " %[in1], %[in1], 16 \t\n" + "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n" + "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n" + PTR_ADDIU " %[in2], %[in2], -16 \t\n" + PTR_ADDIU " %[in3], %[in3], 16 \t\n" + PTR_ADDIU " %[in4], %[in4], -16 \t\n" + + : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), + [temp5]"=&f"(temp5), 
[temp6]"=&f"(temp6), + [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), + [temp11]"=&f"(temp11), [temp12]"=&f"(temp12), + [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1), + [in1]"+r"(in1), [in2]"+r"(in2), + [in3]"+r"(in3), [in4]"+r"(in4) + : + : "memory" + ); + + z[j ].re = temp9; + z[j ].im = temp10; + z[j1].re = temp11; + z[j1].im = temp12; + } + + s->fft_calc(s, z); + + /* post rotation + reordering */ + /* n8 = 32 or 64 */ + for(k = 0; k < n8; k += 2) { + tcos1 = &tcos[n8 - k - 2]; + tsin1 = &tsin[n8 - k - 2]; + tcos2 = &tcos[n8 + k]; + tsin2 = &tsin[n8 + k]; + z1 = &z[n8 - k - 2]; + z2 = &z[n8 + k ]; + + __asm__ volatile ( + "lwc1 %[temp1], 12(%[z1]) \t\n" + "lwc1 %[temp2], 4(%[tsin1]) \t\n" + "lwc1 %[temp3], 4(%[tcos1]) \t\n" + "lwc1 %[temp4], 8(%[z1]) \t\n" + "lwc1 %[temp5], 4(%[z1]) \t\n" + "mul.s %[temp9], %[temp1], %[temp2] \t\n" + "mul.s %[temp10], %[temp1], %[temp3] \t\n" + "lwc1 %[temp6], 0(%[tsin1]) \t\n" + "lwc1 %[temp7], 0(%[tcos1]) \t\n" + "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n" + "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n" + "mul.s %[temp11], %[temp5], %[temp6] \t\n" + "mul.s %[temp12], %[temp5], %[temp7] \t\n" + "lwc1 %[temp8], 0(%[z1]) \t\n" + "lwc1 %[temp1], 4(%[z2]) \t\n" + "lwc1 %[temp2], 0(%[tsin2]) \t\n" + "lwc1 %[temp3], 0(%[tcos2]) \t\n" + "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n" + "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n" + "mul.s %[temp13], %[temp1], %[temp2] \t\n" + "mul.s %[temp14], %[temp1], %[temp3] \t\n" + "lwc1 %[temp4], 0(%[z2]) \t\n" + "lwc1 %[temp5], 12(%[z2]) \t\n" + "lwc1 %[temp6], 4(%[tsin2]) \t\n" + "lwc1 %[temp7], 4(%[tcos2]) \t\n" + "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n" + "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n" + "mul.s %[temp15], %[temp5], %[temp6] \t\n" + "mul.s %[temp16], %[temp5], %[temp7] \t\n" + "lwc1 %[temp8], 8(%[z2]) \t\n" + "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n" + "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n" + : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), + [temp5]"=&f"(temp5), [temp6]"=&f"(temp6), + [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), + [temp11]"=&f"(temp11), [temp12]"=&f"(temp12), + [temp13]"=&f"(temp13), [temp14]"=&f"(temp14), + [temp15]"=&f"(temp15), [temp16]"=&f"(temp16) + : [z1]"r"(z1), [z2]"r"(z2), + [tsin1]"r"(tsin1), [tcos1]"r"(tcos1), + [tsin2]"r"(tsin2), [tcos2]"r"(tcos2) + : "memory" + ); + + z1[1].re = temp9; + z1[1].im = temp14; + z2[0].re = temp13; + z2[0].im = temp10; + + z1[0].re = temp11; + z1[0].im = temp16; + z2[1].re = temp15; + z2[1].im = temp12; + } +} + +/** + * Compute inverse MDCT of size N = 2^nbits + * @param output N samples + * @param input N/2 samples + */ +static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int k; + int n = 1 << s->mdct_bits; + int n2 = n >> 1; + int n4 = n >> 2; + + ff_imdct_half_mips(s, output+n4, input); + + for(k = 0; k < n4; k+=4) { + output[k] = -output[n2-k-1]; + output[k+1] = -output[n2-k-2]; + output[k+2] = -output[n2-k-3]; + output[k+3] = -output[n2-k-4]; + + output[n-k-1] = output[n2+k]; + output[n-k-2] = output[n2+k+1]; + output[n-k-3] = output[n2+k+2]; + output[n-k-4] = output[n2+k+3]; + } +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_fft_init_mips(FFTContext *s) +{ + int n=0; + + ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n); + 
ff_init_ff_cos_tabs(17); + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + s->fft_calc = ff_fft_calc_mips; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_mips; + s->imdct_half = ff_imdct_half_mips; +#endif +#endif +#endif +} diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c new file mode 100644 index 0000000000..990958402c --- /dev/null +++ b/libavcodec/mips/fmtconvert_mips.c @@ -0,0 +1,141 @@ +/* + * Format Conversion Utils for MIPS + * + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Zoran Lukic (zoranl@mips.com) + * Author: Nedeljko Babic (nbabic@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "config.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/fmtconvert.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +static void int32_to_float_fmul_scalar_mips(float *dst, const int *src, + float mul, int len) +{ + /* + * variables used in inline assembler + */ + float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15; + + int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23; + const int *src_end = src + len; + /* + * loop is 8 times unrolled in assembler in order to achieve better performance + */ + __asm__ volatile ( + "i32tf_lp%=: \n\t" + "lw %[rpom11], 0(%[src]) \n\t" + "lw %[rpom21], 4(%[src]) \n\t" + "lw %[rpom1], 8(%[src]) \n\t" + "lw %[rpom2], 12(%[src]) \n\t" + "mtc1 %[rpom11], %[temp1] \n\t" + "mtc1 %[rpom21], %[temp3] \n\t" + "mtc1 %[rpom1], %[temp5] \n\t" + "mtc1 %[rpom2], %[temp7] \n\t" + + "lw %[rpom13], 16(%[src]) \n\t" + "lw %[rpom23], 20(%[src]) \n\t" + "lw %[rpom12], 24(%[src]) \n\t" + "lw %[rpom22], 28(%[src]) \n\t" + "mtc1 %[rpom13], %[temp9] \n\t" + "mtc1 %[rpom23], %[temp11] \n\t" + "mtc1 %[rpom12], %[temp13] \n\t" + "mtc1 %[rpom22], %[temp15] \n\t" + + PTR_ADDIU "%[src], 32 \n\t" + "cvt.s.w %[temp1], %[temp1] \n\t" + "cvt.s.w %[temp3], %[temp3] \n\t" + "cvt.s.w %[temp5], %[temp5] \n\t" + "cvt.s.w %[temp7], %[temp7] \n\t" + + "cvt.s.w %[temp9], %[temp9] \n\t" + "cvt.s.w %[temp11], %[temp11] \n\t" + "cvt.s.w %[temp13], %[temp13] \n\t" + "cvt.s.w %[temp15], %[temp15] \n\t" + + "mul.s %[temp1], %[temp1], %[mul] \n\t" + "mul.s %[temp3], %[temp3], %[mul] \n\t" + "mul.s %[temp5], %[temp5], %[mul] \n\t" + "mul.s %[temp7], %[temp7], %[mul] \n\t" + + "mul.s %[temp9], %[temp9], %[mul] \n\t" + "mul.s %[temp11], %[temp11], %[mul] \n\t" + "mul.s %[temp13], %[temp13], %[mul] \n\t" + "mul.s %[temp15], %[temp15], %[mul] \n\t" + + "swc1 %[temp1], 0(%[dst]) \n\t" /*dst[i] = src[i] * mul; */ + "swc1 %[temp3], 4(%[dst]) \n\t" /*dst[i+1] = src[i+1] * mul;*/ + "swc1 %[temp5], 8(%[dst]) \n\t" /*dst[i+2] = src[i+2] * mul;*/ + "swc1 %[temp7], 12(%[dst]) \n\t" /*dst[i+3] = src[i+3] * mul;*/ + + "swc1 %[temp9], 16(%[dst]) \n\t" /*dst[i+4] = src[i+4] * mul;*/ + "swc1 %[temp11], 20(%[dst]) \n\t" /*dst[i+5] = src[i+5] * mul;*/ + "swc1 %[temp13], 24(%[dst]) \n\t" /*dst[i+6] = src[i+6] * mul;*/ + "swc1 %[temp15], 28(%[dst]) \n\t" /*dst[i+7] = src[i+7] * mul;*/ + PTR_ADDIU "%[dst], 32 \n\t" + "bne %[src], %[src_end], i32tf_lp%= \n\t" + : [temp1]"=&f"(temp1), [temp11]"=&f"(temp11), + [temp13]"=&f"(temp13), [temp15]"=&f"(temp15), + [temp3]"=&f"(temp3), [temp5]"=&f"(temp5), + [temp7]"=&f"(temp7), [temp9]"=&f"(temp9), + [rpom1]"=&r"(rpom1), [rpom2]"=&r"(rpom2), + [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21), + [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22), + [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23), + [dst]"+r"(dst), [src]"+r"(src) + : [mul]"f"(mul), [src_end]"r"(src_end) + : "memory" + ); +} +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c) +{ +#if HAVE_INLINE_ASM + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips; +#endif +} diff --git a/libavcodec/mips/h263dsp_init_mips.c b/libavcodec/mips/h263dsp_init_mips.c new file mode 100644 index 0000000000..09bd93707d --- /dev/null +++ b/libavcodec/mips/h263dsp_init_mips.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015 Manojkumar 
Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h263dsp_mips.h" + +#if HAVE_MSA +static av_cold void h263dsp_init_msa(H263DSPContext *c) +{ + c->h263_h_loop_filter = ff_h263_h_loop_filter_msa; + c->h263_v_loop_filter = ff_h263_v_loop_filter_msa; +} +#endif // #if HAVE_MSA + +av_cold void ff_h263dsp_init_mips(H263DSPContext *c) +{ +#if HAVE_MSA + h263dsp_init_msa(c); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h new file mode 100644 index 0000000000..99a43cd44a --- /dev/null +++ b/libavcodec/mips/h263dsp_mips.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_H263DSP_MIPS_H +#define AVCODEC_MIPS_H263DSP_MIPS_H + +#include "libavcodec/mpegvideo.h" + +void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale); +void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale); +void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block, + int32_t index, int32_t q_scale); +void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block, + int32_t index, int32_t q_scale); +void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block, + int32_t index, int32_t q_scale); +int ff_pix_sum_msa(uint8_t *pix, int line_size); + +#endif // #ifndef AVCODEC_MIPS_H263DSP_MIPS_H diff --git a/libavcodec/mips/h263dsp_msa.c b/libavcodec/mips/h263dsp_msa.c new file mode 100644 index 0000000000..472bcbd70a --- /dev/null +++ b/libavcodec/mips/h263dsp_msa.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h263dsp_mips.h" + +static const uint8_t h263_loop_filter_strength_msa[32] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, + 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12 +}; + +static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale) +{ + int32_t strength = h263_loop_filter_strength_msa[qscale]; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 temp0, temp1, temp2; + v8i16 diff0, diff2, diff4, diff6, diff8; + v8i16 d0, a_d0, str_x2, str; + + src -= 2; + LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in3, in2, in1); + + temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1); + a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0); + temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3); + temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2); + temp2 <<= 2; + diff0 = a_d0 + temp2; + diff2 = -(-diff0 >> 3); + str_x2 = __msa_fill_h(-(strength << 1)); + temp0 = (str_x2 <= diff2); + diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0); + temp2 = str_x2 - diff2; + str = __msa_fill_h(-strength); + temp0 = (diff2 < str); + diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0); + diff4 = diff0 >> 3; + str_x2 = __msa_fill_h(strength << 1); + temp0 = (diff4 <= str_x2); + diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0); + temp2 = str_x2 - diff4; + str = __msa_fill_h(strength); + temp0 = (str < diff4); + diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0); + temp0 = __msa_clti_s_h(diff0, 0); + d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); + diff2 = -diff2 >> 1; + diff4 >>= 1; + diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); + diff6 = (-a_d0) >> 2; + diff6 = -(diff6); + temp2 = -diff8; + temp0 = (diff6 < temp2); + diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0); + diff2 = a_d0 >> 2; + temp0 = (diff2 <= diff8); + diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0); + temp0 = __msa_clti_s_h(a_d0, 0); + diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0); + PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0); + in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6); + in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6); + in3 = __msa_xori_b(in3, 128); + in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0); + in3 = __msa_xori_b(in3, 128); + in2 = __msa_subsus_u_b(in2, (v16i8) d0); + ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1); + in0 = (v16u8) __msa_ilvr_h(temp1, temp0); + in3 = (v16u8) __msa_ilvl_h(temp1, temp0); + ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride); + src += 4 * stride; + ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride); + src += 4 * stride; +} + +static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale) +{ + int32_t strength = h263_loop_filter_strength_msa[qscale]; + uint64_t res0, res1, res2, res3; + v16u8 in0, in1, in2, in3; + v8i16 temp0, 
temp2, diff0, diff2, diff4, diff6, diff8; + v8i16 d0, a_d0, str_x2, str; + + src -= 2 * stride; + LD_UB4(src, stride, in0, in3, in2, in1); + temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1); + a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0); + temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3); + temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2); + temp2 <<= 2; + diff0 = a_d0 + temp2; + diff2 = -(-diff0 >> 3); + str_x2 = __msa_fill_h(-(strength << 1)); + temp0 = (str_x2 <= diff2); + diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0); + temp2 = str_x2 - diff2; + str = __msa_fill_h(-strength); + temp0 = (diff2 < str); + diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0); + diff4 = diff0 >> 3; + str_x2 = __msa_fill_h(strength << 1); + temp0 = (diff4 <= str_x2); + diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0); + temp2 = str_x2 - diff4; + str = __msa_fill_h(strength); + temp0 = (str < diff4); + diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0); + temp0 = __msa_clti_s_h(diff0, 0); + d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); + diff2 = -diff2 >> 1; + diff4 >>= 1; + diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); + diff6 = (-a_d0) >> 2; + diff6 = -(diff6); + temp2 = -diff8; + temp0 = (diff6 < temp2); + diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0); + diff2 = a_d0 >> 2; + temp0 = (diff2 <= diff8); + diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0); + temp0 = __msa_clti_s_h(a_d0, 0); + diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0); + PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0); + in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6); + in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6); + in3 = __msa_xori_b(in3, 128); + in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0); + in3 = __msa_xori_b(in3, 128); + in2 = __msa_subsus_u_b(in2, (v16i8) d0); + res0 = __msa_copy_u_d((v2i64) in0, 0); + res1 = __msa_copy_u_d((v2i64) in3, 0); + res2 = __msa_copy_u_d((v2i64) in2, 0); + res3 = __msa_copy_u_d((v2i64) in1, 0); + SD4(res0, res1, res2, res3, src, stride); +} + +void ff_h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale) +{ + h263_h_loop_filter_msa(src, stride, q_scale); +} + +void ff_h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale) +{ + h263_v_loop_filter_msa(src, stride, q_scale); +} diff --git a/libavcodec/mips/h264chroma_init_mips.c b/libavcodec/mips/h264chroma_init_mips.c new file mode 100644 index 0000000000..122148dc78 --- /dev/null +++ b/libavcodec/mips/h264chroma_init_mips.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264chroma_mips.h" + +#if HAVE_MSA +static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth) +{ + const int high_bit_depth = bit_depth > 8; + + if (!high_bit_depth) { + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa; + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa; + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa; + } +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth) +{ + int high_bit_depth = bit_depth > 8; + + if (!high_bit_depth) { + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmi; + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmi; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmi; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmi; + } +} +#endif /* HAVE_MMI */ + +av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth) +{ +#if HAVE_MSA + h264chroma_init_msa(c, bit_depth); +#endif // #if HAVE_MSA +#if HAVE_MMI + h264chroma_init_mmi(c, bit_depth); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h new file mode 100644 index 0000000000..0ef6c74691 --- /dev/null +++ b/libavcodec/mips/h264chroma_mips.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_H264CHROMA_MIPS_H +#define AVCODEC_MIPS_H264CHROMA_MIPS_H + +#include "libavcodec/h264.h" +void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); +void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); +void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); +void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); +void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); +void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride, + int height, int x, int y); + +void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y); +void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y); +void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y); +void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y); + +#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */ diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c new file mode 100644 index 0000000000..3dd123da36 --- /dev/null +++ b/libavcodec/mips/h264chroma_mmi.c @@ -0,0 +1,717 @@ +/* + * Loongson SIMD optimized h264chroma + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264chroma_mips.h" +#include "constants.h" +#include "libavutil/mips/asmdefs.h" + +void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y) +{ + const int A = (8 - x) * (8 - y); + const int B = x * (8 - y); + const int C = (8 - x) * y; + const int D = x * y; + const int E = B + C; + double ftmp[10]; + uint64_t tmp[1]; + mips_reg addr[1]; + + if (D) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[B], %[B], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[C], %[C], %[ftmp0] \n\t" + "pshufh %[D], %[D], %[ftmp0] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[stride] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp2], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp4], 0x08(%[addr0]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[addr0]) \n\t" + + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[B] \n\t" + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[A] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[B] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + + "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[C] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[D] \n\t" + "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[C] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [B]"f"(B), + [C]"f"(C), [D]"f"(D) + : "memory" + ); + } else if (E) { + const int step = C ? 
stride : 1; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[E], %[E], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[step] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr0]) \n\t" + + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[A] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[E] \n\t" + "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[A] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[E] \n\t" + "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step), + [ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [E]"f"(E) + : "memory" + ); + } else { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "pmullh %[ftmp2], %[ftmp3], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "pmullh %[ftmp2], %[ftmp3], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x02 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A) + : "memory" + ); + } +} + +void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int 
x, int y) +{ + const int A = (8 - x) * (8 - y); + const int B = x * (8 - y); + const int C = (8 - x) * y; + const int D = x * y; + const int E = B + C; + double ftmp[10]; + uint64_t tmp[1]; + mips_reg addr[1]; + + if (D) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[B], %[B], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[C], %[C], %[ftmp0] \n\t" + "pshufh %[D], %[D], %[ftmp0] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[stride] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp2], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp4], 0x08(%[addr0]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[addr0]) \n\t" + + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[B] \n\t" + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[A] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[B] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + + "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[C] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[D] \n\t" + "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[C] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [B]"f"(B), + [C]"f"(C), [D]"f"(D) + : "memory" + ); + } else if (E) { + const int step = C ? 
stride : 1; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[E], %[E], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[step] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr0]) \n\t" + + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[A] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[E] \n\t" + "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[A] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[E] \n\t" + "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step), + [ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [E]"f"(E) + : "memory" + ); + } else { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "pmullh %[ftmp2], %[ftmp3], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "pmullh %[ftmp2], %[ftmp3], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x02 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + 
[tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A) + : "memory" + ); + } +} + +void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y) +{ + const int A = (8 - x) * (8 - y); + const int B = x * (8 - y); + const int C = (8 - x) * y; + const int D = x * y; + const int E = B + C; + double ftmp[8]; + uint64_t tmp[1]; + mips_reg addr[1]; + uint64_t low32; + + if (D) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[B], %[B], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "pshufh %[C], %[C], %[ftmp0] \n\t" + "pshufh %[D], %[D], %[ftmp0] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[stride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + + "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[C] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[D] \n\t" + "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [B]"f"(B), + [C]"f"(C), [D]"f"(D) + : "memory" + ); + } else if (E) { + const int step = C ? 
stride : 1; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[E], %[E], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[step] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[A] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[E] \n\t" + "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step), + [ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [E]"f"(E) + : "memory" + ); + } else { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "1: \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "addi %[h], %[h], -0x02 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A) + : "memory" + ); + } +} + +void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride, + int h, int x, int y) +{ + const int A = (8 - x) *(8 - y); + const int B = x * (8 - y); + const int C = (8 - x) * y; + const int D = x * y; + const int E = B + C; + double ftmp[8]; + uint64_t tmp[1]; + mips_reg addr[1]; + uint64_t low32; + + if (D) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[B], %[B], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "pshufh %[C], %[C], %[ftmp0] \n\t" + "pshufh %[D], %[D], %[ftmp0] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[stride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + 
"mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + + "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[C] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[D] \n\t" + "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [B]"f"(B), + [C]"f"(C), [D]"f"(D) + : "memory" + ); + } else if (E) { + const int step = C ? stride : 1; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "pshufh %[E], %[E], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src], %[step] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[A] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[E] \n\t" + "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step), + [ff_pw_32]"f"(ff_pw_32), + [A]"f"(A), [E]"f"(E) + : "memory" + ); + } else { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x06 \n\t" + "pshufh %[A], %[A], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "1: \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU 
"%[dst], %[dst], %[stride] \n\t" + + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp2], %[A] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "addi %[h], %[h], -0x02 \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32), + [A]"f"(A) + : "memory" + ); + } +} diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c new file mode 100644 index 0000000000..67d0bc12ab --- /dev/null +++ b/libavcodec/mips/h264chroma_msa.c @@ -0,0 +1,2003 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h264chroma_mips.h" + +static const uint8_t chroma_mask_arr[16 * 5] = { + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 +}; + +static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint16_t out0, out1; + v16i8 src0, src1; + v8u16 res_r; + v8i16 res; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[0]); + + LD_SB2(src, src_stride, src0, src1); + + src0 = __msa_vshf_b(mask, src1, src0); + res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + out0 = __msa_copy_u_h(res, 0); + out1 = __msa_copy_u_h(res, 2); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3; + v8u16 res_r; + v8i16 res; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) 
__msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[64]); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); + + src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + + res_r = __msa_dotp_u_h(src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 res_r; + v8i16 res; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[64]); + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6); + + ILVR_D2_UB(src2, src0, src6, src4, src0, src4); + + res_r = __msa_dotp_u_h(src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); + dst += (4 * dst_stride); + + res_r = __msa_dotp_u_h(src4, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else if (4 == height) { + avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else if (8 == height) { + avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } +} + +static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16i8 src0, src1; + v8u16 res_r; + v4i32 res; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[0]); + + LD_SB2(src, src_stride, src0, src1); + + src0 = __msa_vshf_b(mask, src1, src0); + res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST4x2_UB(res, dst, dst_stride); +} + +static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3; + v8u16 res0_r, res1_r; + v4i32 res0, res1; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[0]); + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src1, src2, src3, 
mask, mask, src0, src2); + DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r); + + res0_r <<= 3; + res1_r <<= 3; + + SRARI_H2_UH(res0_r, res1_r, 6); + SAT_UH2_UH(res0_r, res1_r, 7); + PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1); + + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hz_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else { + avc_chroma_hz_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0, + coeff1, height); + } +} + +static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, out0, out1; + v8u16 res0, res1, res2, res3; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[32]); + + for (row = height >> 2; row--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, + coeff_vec, res0, res1, res2, res3); + SLLI_4V(res0, res1, res2, res3, 3); + SRARI_H4_UH(res0, res1, res2, res3, 6); + SAT_UH4_UH(res0, res1, res2, res3, 7); + PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } + + if (0 != (height % 4)) { + for (row = (height % 4); row--;) { + src0 = LD_UB(src); + src += src_stride; + + src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); + + res0 = __msa_dotp_u_h(src0, coeff_vec); + res0 <<= 3; + res0 = (v8u16) __msa_srari_h((v8i16) res0, 6); + res0 = __msa_sat_u_h(res0, 7); + res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0); + + ST8x1_UB(res0, dst); + dst += dst_stride; + } + } +} + +static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint16_t out0, out1; + v16i8 src0, src1, src2; + v16u8 tmp0, tmp1; + v8i16 res; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_SB3(src, src_stride, src0, src1, src2); + + ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + out0 = __msa_copy_u_h(res, 0); + out1 = __msa_copy_u_h(res, 2); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 res; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + 
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 res; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + LD_UB4(src, src_stride, src5, src6, src7, src8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); + dst += (4 * dst_stride); + + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, + tmp0, tmp1, tmp2, tmp3); + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST2x4_UB(res, 0, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else if (4 == height) { + avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else if (8 == height) { + avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } +} + +static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2; + v16u8 tmp0, tmp1; + v4i32 res; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_UB3(src, src_stride, src0, src1, src2); + ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + + ST4x2_UB(res, dst, dst_stride); +} + +static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 res0_r, res1_r; + v4i32 res0, res1; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + 
v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + src0 = LD_UB(src); + src += src_stride; + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r); + + res0_r <<= 3; + res1_r <<= 3; + + SRARI_H2_UH(res0_r, res1_r, 6); + SAT_UH2_UH(res0_r, res1_r, 7); + PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1); + + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_vt_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1); + } else { + avc_chroma_vt_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0, + coeff1, height); + } +} + +static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4, out0, out1; + v8u16 res0, res1, res2, res3; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + src0 = LD_UB(src); + src += src_stride; + + for (row = height >> 2; row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + src0, src1, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, + coeff_vec, res0, res1, res2, res3); + SLLI_4V(res0, res1, res2, res3, 3); + SRARI_H4_UH(res0, res1, res2, res3, 6); + SAT_UH4_UH(res0, res1, res2, res3, 7); + PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); + + ST8x4_UB(out0, out1, dst, dst_stride); + + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1) +{ + uint16_t out0, out1; + v16u8 src0, src1, src2; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v8i16 res_vert; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB3(src, src_stride, src0, src1, src2); + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + out0 = __msa_copy_u_h(res_vert, 0); + out1 = __msa_copy_u_h(res_vert, 1); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, 
uint32_t coef_ver1) +{ + v16u8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v8i16 res; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v8i16 res; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + LD_UB4(src, src_stride, src5, src6, src7, src8); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + ST2x4_UB(res, 0, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0, + coef_hor1, coef_ver0, coef_ver1); + } else if (4 == height) { + avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0, + coef_hor1, coef_ver0, coef_ver1); + } else if (8 == height) { + 
avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0, + coef_hor1, coef_ver0, coef_ver1); + } +} + +static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1) +{ + v16u8 src0, src1, src2; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v16i8 mask; + v4i32 res; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[0]); + LD_UB3(src, src_stride, src0, src1, src2); + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + ST4x2_UB(res, dst, dst_stride); +} + +static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + v4i32 res0, res1; + + mask = LD_SB(&chroma_mask_arr[0]); + + src0 = LD_UB(src); + src += src_stride; + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3); + MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3); + ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); + SRARI_H2_UH(res_vt0, res_vt1, 6); + SAT_UH2_UH(res_vt0, res_vt1, 7); + PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1); + + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0, + coef_hor1, coef_ver0, coef_ver1); + } else { + avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, coef_ver0, + coef_ver1, height); + } +} + +static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, uint32_t coef_hor1, + uint32_t coef_ver0, uint32_t coef_ver1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4, out0, out1; + 
v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[32]); + + src0 = LD_UB(src); + src += src_stride; + + src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); + res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); + VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); + DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4); + MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3); + + res_vt0 += (res_hz0 * coeff_vt_vec1); + res_vt1 += (res_hz1 * coeff_vt_vec1); + res_vt2 += (res_hz2 * coeff_vt_vec1); + res_vt3 += (res_hz3 * coeff_vt_vec1); + + SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); + SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); + PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + dst += (4 * dst_stride); + + res_hz0 = res_hz4; + } +} + +static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint16_t out0, out1; + uint32_t load0, load1; + v16i8 src0, src1; + v16u8 dst_data = { 0 }; + v8u16 res_r; + v16u8 res; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[0]); + + LD_SB2(src, src_stride, src0, src1); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst_data); + + src0 = __msa_vshf_b(mask, src1, src0); + + res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + dst_data = __msa_aver_u_b(res, dst_data); + + out0 = __msa_copy_u_h((v8i16) dst_data, 0); + out1 = __msa_copy_u_h((v8i16) dst_data, 2); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8u16 res_r; + v16i8 res, mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[64]); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); + + src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + + res_r = __msa_dotp_u_h(src0, coeff_vec); + res_r 
<<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + dst0 = __msa_aver_u_b((v16u8) res, dst0); + + ST2x4_UB(dst0, 0, dst, dst_stride); +} + +static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 res0_r, res1_r; + v16u8 res0, res1, mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_UB(&chroma_mask_arr[64]); + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3); + + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5); + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6); + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6); + ILVR_D2_UB(src2, src0, src6, src4, src0, src4); + DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r); + + res0_r <<= 3; + res1_r <<= 3; + + SRARI_H2_UH(res0_r, res1_r, 6); + SAT_UH2_UH(res0_r, res1_r, 7); + PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4); + + ST2x4_UB(dst0, 0, dst, dst_stride); + dst += (4 * dst_stride); + ST2x4_UB(dst4, 0, dst, dst_stride); +} + +static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else if (4 == height) { + avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else if (8 == height) { + avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } +} + +static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint32_t load0, load1; + v16i8 src0, src1; + v16u8 dst_data = { 0 }; + v8u16 res_r; + v16i8 res, mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[0]); + + LD_SB2(src, src_stride, src0, src1); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst_data); + + src0 = __msa_vshf_b(mask, src1, src0); + + res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + dst_data = __msa_aver_u_b((v16u8) res, dst_data); + + ST4x2_UB(dst_data, dst, dst_stride); +} + +static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint32_t coeff0, + uint32_t coeff1, + int32_t height) +{ + uint32_t load0, load1; + uint32_t row; + v16u8 
src0, src1, src2, src3; + v16u8 dst0 = { 0 }; + v16u8 dst1 = { 0 }; + v8u16 res0_r, res1_r; + v16u8 res0, res1, mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_UB(&chroma_mask_arr[0]); + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst0); + + load0 = LW(dst + 2 * dst_stride); + load1 = LW(dst + 3 * dst_stride); + + INSERT_W2_UB(load0, load1, dst1); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); + DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r); + + res0_r <<= 3; + res1_r <<= 3; + + SRARI_H2_UH(res0_r, res1_r, 6); + SAT_UH2_UH(res0_r, res1_r, 7); + PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); + + ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else { + avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride, + dst, dst_stride, + coeff0, coeff1, height); + } +} + +static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, out0, out1; + v8u16 res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + mask = LD_SB(&chroma_mask_arr[32]); + + for (row = height >> 2; row--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, + coeff_vec, res0, res1, res2, res3); + SLLI_4V(res0, res1, res2, res3, 3); + SRARI_H4_UH(res0, res1, res2, res3, 6); + SAT_UH4_UH(res0, res1, res2, res3, 7); + PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); + PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint16_t out0, out1; + uint32_t load0, load1; + v16i8 src0, src1, src2, tmp0, tmp1, res; + v16u8 dst_data = { 0 }; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_SB3(src, src_stride, src0, src1, src2); + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst_data); + + ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1); + + tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); + res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = 
__msa_sat_u_h(res_r, 7); + res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + dst_data = __msa_aver_u_b((v16u8) res, dst_data); + out0 = __msa_copy_u_h((v8i16) dst_data, 0); + out1 = __msa_copy_u_h((v8i16) dst_data, 2); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint32_t load0, load1; + v16i8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 res_r; + v8i16 res; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + v16u8 dst_data = { 0 }; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0); + dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1); + + load0 = LW(dst + 2 * dst_stride); + load1 = LW(dst + 3 * dst_stride); + + dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0); + dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data); + + ST2x4_UB(res, 0, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint32_t load0, load1, load2, load3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 res; + v8u16 res_r; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + v16u8 dst_data0 = { 0 }; + v16u8 dst_data1 = { 0 }; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + LD_SB4(src, src_stride, src5, src6, src7, src8); + + LW4(dst, dst_stride, load0, load1, load2, load3); + + dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0); + dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1); + dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2); + dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3); + + LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3); + + dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0); + dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1); + dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2); + dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0); + + ST2x4_UB(res, 0, dst, 
dst_stride); + dst += (4 * dst_stride); + + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, + tmp0, tmp1, tmp2, tmp3); + + ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + + tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); + + res_r = __msa_dotp_u_h(tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + + res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1); + + ST2x4_UB(res, 0, dst, dst_stride); +} + +static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else if (4 == height) { + avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else if (8 == height) { + avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } +} + +static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1) +{ + uint32_t load0, load1; + v16i8 src0, src1, src2, tmp0, tmp1; + v16u8 dst_data = { 0 }; + v8u16 res_r; + v16u8 res; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + LD_SB3(src, src_stride, src0, src1, src2); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst_data); + ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1); + + tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); + + res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec); + res_r <<= 3; + res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); + res_r = __msa_sat_u_h(res_r, 7); + res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); + res = __msa_aver_u_b(res, dst_data); + + ST4x2_UB(res, dst, dst_stride); +} + +static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint32_t coeff0, + uint32_t coeff1, + int32_t height) +{ + uint32_t load0, load1, row; + v16i8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0 = { 0 }; + v16u8 dst1 = { 0 }; + v8u16 res0_r, res1_r; + v16u8 res0, res1; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + src0 = LD_SB(src); + src += src_stride; + + for (row = (height >> 2); row--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + INSERT_W2_UB(load0, load1, dst0); + load0 = LW(dst + 2 * dst_stride); + load1 = LW(dst + 3 * dst_stride); + INSERT_W2_UB(load0, load1, dst1); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + tmp0, tmp1, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); + DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r); + + res0_r <<= 3; + res1_r <<= 3; + + SRARI_H2_UH(res0_r, res1_r, 6); + SAT_UH2_UH(res0_r, res1_r, 7); + PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, + 
uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + if (2 == height) { + avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1); + } else { + avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride, + coeff0, coeff1, height); + } +} + +static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coeff0, uint32_t coeff1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v8u16 res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 coeff_vec0 = __msa_fill_b(coeff0); + v16i8 coeff_vec1 = __msa_fill_b(coeff1); + v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); + + src0 = LD_UB(src); + src += src_stride; + + for (row = height >> 2; row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + src0, src1, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, + coeff_vec, res0, res1, res2, res3); + SLLI_4V(res0, res1, res2, res3, 3); + SRARI_H4_UH(res0, res1, res2, res3, 6); + SAT_UH4_UH(res0, res1, res2, res3, 7); + PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); + PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1) +{ + uint16_t out0, out1; + v16u8 dst0, dst1; + v16u8 src0, src1, src2; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v16i8 res, mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB3(src, src_stride, src0, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1); + dst0 = __msa_aver_u_b((v16u8) res, dst0); + out0 = __msa_copy_u_h((v8i16) dst0, 0); + out1 = __msa_copy_u_h((v8i16) dst0, 1); + + SH(out0, dst); + dst += dst_stride; + SH(out1, dst); +} + +static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1) +{ + v16u8 src0, src1, src2, src3, src4; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1, dst2, dst3; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v16i8 res, mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) 
__msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3); + dst0 = __msa_aver_u_b((v16u8) res, dst0); + + ST2x4_UB(dst0, 0, dst, dst_stride); +} + +static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v16i8 res, mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[48]); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + LD_UB4(src, src_stride, src5, src6, src7, src8); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2); + dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3); + + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5); + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6); + dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7); + + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1); + VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3); + ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + dst0 = __msa_aver_u_b((v16u8) res, dst0); + + ST2x4_UB(dst0, 0, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + dst4 = __msa_aver_u_b((v16u8) res, dst4); + + ST2x4_UB(dst4, 0, dst, dst_stride); +} + 
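/*
 * Illustrative note (editor's sketch, not part of the submitted patch):
 * every *_hv_and_aver_dst_* kernel in this file evaluates the standard
 * H.264 chroma bilinear filter and then averages the result into dst.
 * The scalar reference below shows the per-pixel arithmetic that the MSA
 * code above vectorizes; the helper name and the fixed 2-pixel width are
 * assumptions made purely for illustration.
 */
static inline void chroma_hv_avg_scalar_sketch(const uint8_t *src,
                                               int src_stride,
                                               uint8_t *dst, int dst_stride,
                                               int coef_hor0, int coef_hor1,
                                               int coef_ver0, int coef_ver1,
                                               int height)
{
    /* coef_hor0/coef_ver0 carry x/y and coef_hor1/coef_ver1 carry
     * (8 - x)/(8 - y), as passed by ff_avg_h264_chroma_mc2_msa() below. */
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < 2; col++) {
            int a = src[col];                   /* top-left     */
            int b = src[col + 1];               /* top-right    */
            int c = src[col + src_stride];      /* bottom-left  */
            int d = src[col + src_stride + 1];  /* bottom-right */
            /* horizontal pass (DOTP_UB*), vertical pass (MUL2 plus the add),
             * then the 6-bit rounding shift performed by SRARI_H* above */
            int hz_top = coef_hor1 * a + coef_hor0 * b;
            int hz_bot = coef_hor1 * c + coef_hor0 * d;
            int pix    = (coef_ver1 * hz_top + coef_ver0 * hz_bot + 32) >> 6;
            /* AVER_UB*: average the filtered pixel with the one in dst */
            dst[col] = (dst[col] + pix + 1) >> 1;
        }
        src += src_stride;
        dst += dst_stride;
    }
}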
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, + coef_ver0, coef_ver1); + } else if (4 == height) { + avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, + coef_ver0, coef_ver1); + } else if (8 == height) { + avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, + coef_ver0, coef_ver1); + } +} + +static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1) +{ + v16u8 src0, src1, src2; + v16u8 dst0, dst1; + v8u16 res_hz0, res_hz1, res_vt0, res_vt1; + v16i8 res, mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[0]); + + LD_UB3(src, src_stride, src0, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); + MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); + + res_vt0 += res_vt1; + res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); + res_vt0 = __msa_sat_u_h(res_vt0, 7); + res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); + dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1); + dst0 = __msa_aver_u_b((v16u8) res, dst0); + + ST4x2_UB(dst0, dst, dst_stride); +} + +static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + v16u8 res0, res1; + + mask = LD_SB(&chroma_mask_arr[0]); + + src0 = LD_UB(src); + src += src_stride; + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); + VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); + DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3); + MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3); + ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); + SRARI_H2_UH(res_vt0, res_vt1, 6); + SAT_UH2_UH(res_vt0, res_vt1, 7); + PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1); + + dst0 = (v16u8) 
__msa_insve_w((v4i32) dst0, 1, (v4i32) dst1); + dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3); + + AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); + + ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1, + int32_t height) +{ + if (2 == height) { + avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, + coef_ver0, coef_ver1); + } else { + avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride, + coef_hor0, coef_hor1, + coef_ver0, coef_ver1, height); + } +} + +static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint32_t coef_hor0, + uint32_t coef_hor1, + uint32_t coef_ver0, + uint32_t coef_ver1, + int32_t height) +{ + uint32_t row; + v16u8 src0, src1, src2, src3, src4, out0, out1; + v8u16 res_hz0, res_hz1, res_hz2; + v8u16 res_hz3, res_hz4; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); + v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); + v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); + v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); + v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); + + mask = LD_SB(&chroma_mask_arr[32]); + + src0 = LD_UB(src); + src += src_stride; + + src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); + res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); + + for (row = (height >> 2); row--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); + VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); + DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4); + MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3); + + res_vt0 += (res_hz0 * coeff_vt_vec1); + res_vt1 += (res_hz1 * coeff_vt_vec1); + res_vt2 += (res_hz2 * coeff_vt_vec1); + res_vt3 += (res_hz3 * coeff_vt_vec1); + + SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); + SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); + + PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); + PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + res_hz0 = res_hz4; + } +} + +static void copy_width8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = 
__msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + out2 = __msa_copy_u_w((v4i32) dst2, 0); + out3 = __msa_copy_u_w((v4i32) dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, 
dst3); + + out0 = __msa_copy_u_d((v2i64) dst0, 0); + out1 = __msa_copy_u_d((v2i64) dst1, 0); + out2 = __msa_copy_u_d((v2i64) dst2, 0); + out3 = __msa_copy_u_d((v2i64) dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + if (x && y) { + avc_chroma_hv_8w_msa(src, stride, dst, + stride, x, (8 - x), y, (8 - y), height); + } else if (x) { + avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height); + } else { + copy_width8_msa(src, stride, dst, stride, height); + } +} + +void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + int32_t cnt; + + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + if (x && y) { + avc_chroma_hv_4w_msa(src, stride, dst, + stride, x, (8 - x), y, (8 - y), height); + } else if (x) { + avc_chroma_hz_4w_msa(src, stride, dst, stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_4w_msa(src, stride, dst, stride, y, (8 - y), height); + } else { + for (cnt = height; cnt--;) { + *((uint32_t *) dst) = *((uint32_t *) src); + + src += stride; + dst += stride; + } + } +} + +void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + int32_t cnt; + + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + if (x && y) { + avc_chroma_hv_2w_msa(src, stride, dst, + stride, x, (8 - x), y, (8 - y), height); + } else if (x) { + avc_chroma_hz_2w_msa(src, stride, dst, stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_2w_msa(src, stride, dst, stride, y, (8 - y), height); + } else { + for (cnt = height; cnt--;) { + *((uint16_t *) dst) = *((uint16_t *) src); + + src += stride; + dst += stride; + } + } +} + +void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + + if (x && y) { + avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst, + stride, x, (8 - x), y, + (8 - y), height); + } else if (x) { + avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst, + stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst, + stride, y, (8 - y), height); + } else { + avg_width8_msa(src, stride, dst, stride, height); + } +} + +void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + if (x && y) { + avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst, + stride, x, (8 - x), y, + (8 - y), height); + } else if (x) { + avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst, + stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst, + stride, y, (8 - y), height); + } else { + avg_width4_msa(src, stride, dst, stride, height); + } +} + +void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int x, int y) +{ + int32_t cnt; + + av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); + + if (x && y) { + avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst, + stride, x, (8 - x), y, + (8 - y), height); + } else if (x) { + avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst, + stride, x, (8 - x), height); + } else if (y) { + avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst, + stride, y, (8 - y), height); + } else { + for (cnt = 
height; cnt--;) { + dst[0] = (dst[0] + src[0] + 1) >> 1; + dst[1] = (dst[1] + src[1] + 1) >> 1; + + src += stride; + dst += stride; + } + } +} diff --git a/libavcodec/mips/h264dsp_init_mips.c b/libavcodec/mips/h264dsp_init_mips.c new file mode 100644 index 0000000000..1fe7f8468c --- /dev/null +++ b/libavcodec/mips/h264dsp_init_mips.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264dsp_mips.h" + +#if HAVE_MSA +static av_cold void h264dsp_init_msa(H264DSPContext *c, + const int bit_depth, + const int chroma_format_idc) +{ + if (8 == bit_depth) { + c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_inter_msa; + c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_inter_msa; + c->h264_h_loop_filter_luma_mbaff = + ff_h264_h_loop_filter_luma_mbaff_msa; + c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_msa; + c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_msa; + c->h264_h_loop_filter_luma_mbaff_intra = + ff_h264_h_loop_filter_luma_mbaff_intra_msa; + c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_inter_msa; + + if (chroma_format_idc <= 1) + c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_inter_msa; + else + c->h264_h_loop_filter_chroma = + ff_h264_h_loop_filter_chroma422_msa; + + if (chroma_format_idc > 1) + c->h264_h_loop_filter_chroma_mbaff = + ff_h264_h_loop_filter_chroma422_mbaff_msa; + + c->h264_v_loop_filter_chroma_intra = + ff_h264_v_lpf_chroma_intra_msa; + + if (chroma_format_idc <= 1) + c->h264_h_loop_filter_chroma_intra = + ff_h264_h_lpf_chroma_intra_msa; + + /* Weighted MC */ + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_msa; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_msa; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_msa; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_msa; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_msa; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_msa; + + c->h264_idct_add = ff_h264_idct_add_msa; + c->h264_idct8_add = ff_h264_idct8_addblk_msa; + c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_msa; + c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_msa; + c->h264_idct_add16 = ff_h264_idct_add16_msa; + c->h264_idct8_add4 = ff_h264_idct8_add4_msa; + + if (chroma_format_idc <= 1) + c->h264_idct_add8 = ff_h264_idct_add8_msa; + else + c->h264_idct_add8 = ff_h264_idct_add8_422_msa; + + c->h264_idct_add16intra = ff_h264_idct_add16_intra_msa; + c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_msa; + } // if (8 == bit_depth) +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void h264dsp_init_mmi(H264DSPContext * c, const int bit_depth, + const int 
chroma_format_idc) +{ + if (bit_depth == 8) { + c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_mmi; + c->h264_idct_add = ff_h264_idct_add_8_mmi; + c->h264_idct8_add = ff_h264_idct8_add_8_mmi; + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmi; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmi; + c->h264_idct_add16 = ff_h264_idct_add16_8_mmi; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmi; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmi; + + if (chroma_format_idc <= 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmi; + else + c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmi; + + c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_mmi; + + if (chroma_format_idc <= 1) + c->h264_chroma_dc_dequant_idct = + ff_h264_chroma_dc_dequant_idct_8_mmi; + else + c->h264_chroma_dc_dequant_idct = + ff_h264_chroma422_dc_dequant_idct_8_mmi; + + c->weight_h264_pixels_tab[0] = ff_h264_weight_pixels16_8_mmi; + c->weight_h264_pixels_tab[1] = ff_h264_weight_pixels8_8_mmi; + c->weight_h264_pixels_tab[2] = ff_h264_weight_pixels4_8_mmi; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_pixels16_8_mmi; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_pixels8_8_mmi; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_pixels4_8_mmi; + + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmi; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmi; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = + ff_deblock_h_chroma_8_mmi; + c->h264_h_loop_filter_chroma_intra = + ff_deblock_h_chroma_intra_8_mmi; + } + + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmi; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmi; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmi; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmi; + } +} +#endif /* HAVE_MMI */ + +av_cold void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ +#if HAVE_MSA + h264dsp_init_msa(c, bit_depth, chroma_format_idc); +#endif // #if HAVE_MSA +#if HAVE_MMI + h264dsp_init_mmi(c, bit_depth, chroma_format_idc); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/h264dsp_mips.h b/libavcodec/mips/h264dsp_mips.h new file mode 100644 index 0000000000..2fdfd11d95 --- /dev/null +++ b/libavcodec/mips/h264dsp_mips.h @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_H264DSP_MIPS_H +#define AVCODEC_MIPS_H264DSP_MIPS_H + +#include "libavcodec/h264.h" +#include "constants.h" + +void ff_h264_h_lpf_luma_inter_msa(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_h264_v_lpf_luma_inter_msa(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_h264_h_lpf_chroma_inter_msa(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_h264_v_lpf_chroma_inter_msa(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride, + int32_t alpha, int32_t beta, + int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride, + int32_t alpha, int32_t beta, + int8_t *tc0); +void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t stride, + int32_t alpha, int32_t beta, + int8_t *tc0); + +void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride); +void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride); +void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, + int32_t de_q_val); +void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset, + int16_t *block, int32_t stride, + const uint8_t nnzc[15 * 8]); +void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nnzc[15 * 8]); +void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nnzc[15 * 8]); +void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nnzc[15 * 8]); +void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride); +void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride); +void ff_h264_idct8_add4_msa(uint8_t *dst, const int *blk_offset, + int16_t *blk, int dst_stride, + const uint8_t nnzc[15 * 8]); + +void ff_h264_h_lpf_luma_intra_msa(uint8_t *src, int stride, + int alpha, int beta); +void ff_h264_v_lpf_luma_intra_msa(uint8_t *src, int stride, + int alpha, int beta); +void ff_h264_h_lpf_chroma_intra_msa(uint8_t *src, int stride, + int alpha, int beta); +void ff_h264_v_lpf_chroma_intra_msa(uint8_t *src, int stride, + int alpha, int beta); +void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int stride, + int alpha, int beta); + +void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, + int weightd, int weights, int offset); +void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, + int weightd, int weights, int offset); +void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, + int weightd, int weights, int offset); +void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride, int height, + int log2_denom, int weight, int offset); +void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, int height, + int log2_denom, int weight, int offset); +void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride, int height, + int log2_denom, int weight, int offset); + +void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t 
dst_stride); +void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void 
ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc13_msa(uint8_t 
*dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, + ptrdiff_t stride); +void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, + ptrdiff_t stride); +void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, + ptrdiff_t stride); +void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, + ptrdiff_t stride); +void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride); +void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride); +void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride); +void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride); +void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride); +void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride); + +void ff_h264_add_pixels4_8_mmi(uint8_t *_dst, int16_t *_src, int stride); +void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride); +void 
ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]); +void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]); +void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]); +void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]); +void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]); +void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input, + int qmul); +void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul); +void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul); + +void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset); +void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, int weightd, int weights, + int offset); +void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset); +void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, int weightd, int weights, + int offset); +void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset); +void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, + int stride, int height, int log2_denom, int weightd, int weights, + int offset); + +void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0); +void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta); +void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0); +void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta); +void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0); +void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta); +void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0); +void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta); +void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0); +void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta); + +void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void 
ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const 
uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t 
dst_stride); +void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +#endif // #ifndef AVCODEC_MIPS_H264DSP_MIPS_H diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c new file mode 100644 index 0000000000..a62bbabc67 --- /dev/null +++ b/libavcodec/mips/h264dsp_mmi.c @@ -0,0 +1,2824 @@ +/* + * Loongson SIMD optimized h264dsp + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn> + * Heiher <r@hev.cc> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/bit_depth_template.c" +#include "h264dsp_mips.h" +#include "libavutil/mips/asmdefs.h" + +void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) +{ + double ftmp[9]; + uint64_t low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[src]) \n\t" + "ldc1 %[ftmp2], 0x08(%[src]) \n\t" + "ldc1 %[ftmp3], 0x10(%[src]) \n\t" + "ldc1 %[ftmp4], 0x18(%[src]) \n\t" + "uld %[low32], 0x00(%[dst0]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "uld %[low32], 0x00(%[dst1]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "uld %[low32], 0x00(%[dst2]) \n\t" + "mtc1 %[low32], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[dst3]) \n\t" + "mtc1 %[low32], %[ftmp8] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst0]) \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst1]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst1]) \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst2]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst2]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst3]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [low32]"=&r"(low32) + : [dst0]"r"(dst), [dst1]"r"(dst+stride), + [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride), + [src]"r"(src) + : "memory" + ); + + memset(src, 0, 32); +} + +void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) +{ + double ftmp[12]; + uint64_t tmp[1]; + uint64_t low32; + + __asm__ volatile ( + "dli %[tmp0], 0x01 \n\t" + "ldc1 %[ftmp0], 0x00(%[block]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "ldc1 %[ftmp1], 0x08(%[block]) \n\t" + "dli %[tmp0], 0x06 \n\t" + "ldc1 %[ftmp2], 0x10(%[block]) \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t" + "ldc1 %[ftmp3], 0x18(%[block]) \n\t" + "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp11], %[ftmp5], %[ftmp10] \n\t" + "psubh %[ftmp2], %[ftmp10], %[ftmp5] \n\t" + "paddh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp5], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp0] \n\t" + "punpcklwd %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklwd %[ftmp10], %[ftmp1], %[ftmp4] \n\t" + "punpckhwd %[ftmp0], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_32] \n\t" + "psrah %[ftmp4], 
%[ftmp2], %[ftmp8] \n\t" + "psrah %[ftmp3], %[ftmp0], %[ftmp8] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp10], %[ftmp5] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "sdc1 %[ftmp7], 0x00(%[block]) \n\t" + "sdc1 %[ftmp7], 0x08(%[block]) \n\t" + "sdc1 %[ftmp7], 0x10(%[block]) \n\t" + "sdc1 %[ftmp7], 0x18(%[block]) \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [low32]"=&r"(low32) + : [dst]"r"(dst), [block]"r"(block), + [stride]"r"((mips_reg)stride), [ff_pw_32]"f"(ff_pw_32) + : "memory" + ); + + memset(block, 0, 32); +} + +void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride) +{ + double ftmp[16]; + uint64_t tmp[8]; + mips_reg addr[1]; + uint64_t low32; + + __asm__ volatile ( + "lhu %[tmp0], 0x00(%[block]) \n\t" + PTR_ADDI "$29, $29, -0x20 \n\t" + PTR_ADDIU "%[tmp0], %[tmp0], 0x20 \n\t" + "ldc1 %[ftmp1], 0x10(%[block]) \n\t" + "sh %[tmp0], 0x00(%[block]) \n\t" + "ldc1 %[ftmp2], 0x20(%[block]) \n\t" + "dli %[tmp0], 0x01 \n\t" + "ldc1 %[ftmp3], 0x30(%[block]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "ldc1 %[ftmp5], 0x50(%[block]) \n\t" + "ldc1 %[ftmp6], 0x60(%[block]) \n\t" + "ldc1 %[ftmp7], 0x70(%[block]) \n\t" + "mov.d %[ftmp0], %[ftmp1] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psrah %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "psrah %[ftmp3], %[ftmp3], 
%[ftmp8] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "dli %[tmp0], 0x02 \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "mov.d %[ftmp7], %[ftmp1] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp3], %[ftmp4], %[ftmp9] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "mov.d %[ftmp5], %[ftmp6] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "ldc1 %[ftmp2], 0x00(%[block]) \n\t" + "ldc1 %[ftmp5], 0x40(%[block]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "sdc1 %[ftmp6], 0x00(%[block]) \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp0], %[ftmp3], %[ftmp1] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp3] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhwd %[ftmp3], %[ftmp6], %[ftmp0] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "ldc1 %[ftmp0], 0x00(%[block]) \n\t" + "sdc1 %[ftmp7], 0x00($29) \n\t" + "sdc1 %[ftmp1], 0x10($29) \n\t" + "dmfc1 %[tmp1], %[ftmp6] \n\t" + "dmfc1 %[tmp3], %[ftmp3] \n\t" + "punpckhhw %[ftmp3], %[ftmp5], %[ftmp2] \n\t" + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp4] \n\t" + "punpcklwd %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "punpckhwd %[ftmp4], %[ftmp3], %[ftmp2] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "sdc1 %[ftmp5], 0x08($29) \n\t" + "sdc1 %[ftmp0], 0x18($29) \n\t" + "dmfc1 %[tmp2], %[ftmp3] \n\t" + "dmfc1 %[tmp4], %[ftmp4] \n\t" + "ldc1 %[ftmp1], 0x18(%[block]) \n\t" + "ldc1 %[ftmp6], 0x28(%[block]) \n\t" + "ldc1 %[ftmp2], 0x38(%[block]) \n\t" + "ldc1 %[ftmp0], 0x58(%[block]) \n\t" + "ldc1 %[ftmp3], 0x68(%[block]) \n\t" + "ldc1 %[ftmp4], 0x78(%[block]) \n\t" + "mov.d %[ftmp7], %[ftmp1] \n\t" + "psrah %[ftmp5], %[ftmp0], %[ftmp8] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "psubh 
%[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "mov.d %[ftmp4], %[ftmp1] \n\t" + "psrah %[ftmp2], %[ftmp5], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "mov.d %[ftmp0], %[ftmp3] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "psrah %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "ldc1 %[ftmp6], 0x08(%[block]) \n\t" + "ldc1 %[ftmp0], 0x48(%[block]) \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "sdc1 %[ftmp3], 0x08(%[block]) \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhhw %[ftmp3], %[ftmp4], %[ftmp7] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "punpckhhw %[ftmp7], %[ftmp2], %[ftmp1] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "punpckhwd %[ftmp1], %[ftmp4], %[ftmp2] \n\t" + "punpcklwd %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "ldc1 %[ftmp7], 0x08(%[block]) \n\t" + "dmfc1 %[tmp5], %[ftmp4] \n\t" + "dmfc1 %[tmp7], %[ftmp1] \n\t" + "mov.d %[ftmp12], %[ftmp3] \n\t" + "mov.d %[ftmp14], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhhw %[ftmp6], %[ftmp5], %[ftmp7] \n\t" + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "punpckhwd %[ftmp7], %[ftmp0], %[ftmp5] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp2], %[ftmp6] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "dmfc1 %[tmp6], %[ftmp0] \n\t" + "mov.d %[ftmp11], %[ftmp7] \n\t" + "mov.d %[ftmp13], %[ftmp2] \n\t" + "mov.d %[ftmp15], %[ftmp5] \n\t" + PTR_ADDIU "%[addr0], %[dst], 0x04 \n\t" + "dmtc1 %[tmp7], %[ftmp7] \n\t" + "dmtc1 %[tmp3], %[ftmp6] \n\t" + "ldc1 %[ftmp1], 0x10($29) \n\t" + "dmtc1 %[tmp1], %[ftmp3] \n\t" + "mov.d %[ftmp4], %[ftmp1] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp7], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddh %[ftmp1], 
%[ftmp1], %[ftmp7] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp14] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp14] \n\t" + "psrah %[ftmp5], %[ftmp14], %[ftmp8] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "mov.d %[ftmp5], %[ftmp1] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp6], %[ftmp0], %[ftmp9] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "mov.d %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp2], %[ftmp12], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp3], %[ftmp8] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "ldc1 %[ftmp3], 0x00($29) \n\t" + "dmtc1 %[tmp5], %[ftmp7] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "sdc1 %[ftmp3], 0x00($29) \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "sdc1 %[ftmp0], 0x10($29) \n\t" + "dmfc1 %[tmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "sdc1 %[ftmp2], 0x00(%[block]) \n\t" + "sdc1 %[ftmp2], 0x08(%[block]) \n\t" + "sdc1 %[ftmp2], 0x10(%[block]) \n\t" + "sdc1 %[ftmp2], 0x18(%[block]) \n\t" + "sdc1 %[ftmp2], 0x20(%[block]) \n\t" + "sdc1 %[ftmp2], 0x28(%[block]) \n\t" + "sdc1 %[ftmp2], 0x30(%[block]) \n\t" + "sdc1 %[ftmp2], 0x38(%[block]) \n\t" + "sdc1 %[ftmp2], 0x40(%[block]) \n\t" + "sdc1 %[ftmp2], 0x48(%[block]) \n\t" + "sdc1 %[ftmp2], 0x50(%[block]) \n\t" + "sdc1 %[ftmp2], 0x58(%[block]) \n\t" + "sdc1 %[ftmp2], 0x60(%[block]) \n\t" + "sdc1 %[ftmp2], 0x68(%[block]) \n\t" + "sdc1 %[ftmp2], 0x70(%[block]) \n\t" + "sdc1 %[ftmp2], 0x78(%[block]) \n\t" + "dli %[tmp3], 0x06 \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "mtc1 %[tmp3], %[ftmp10] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + 
PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "ldc1 %[ftmp5], 0x00($29) \n\t" + "ldc1 %[ftmp4], 0x10($29) \n\t" + "dmtc1 %[tmp1], %[ftmp6] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + PTR_ADDU "%[dst], %[dst], %[stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t" + "dmtc1 %[tmp4], %[ftmp1] \n\t" + "dmtc1 %[tmp2], %[ftmp6] \n\t" + "ldc1 %[ftmp4], 0x18($29) \n\t" + "mov.d %[ftmp5], %[ftmp4] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psrah %[ftmp7], %[ftmp11], %[ftmp8] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp15] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubh %[ftmp3], %[ftmp11], %[ftmp1] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp15] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" + "psrah %[ftmp2], %[ftmp15], %[ftmp8] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "mov.d %[ftmp2], %[ftmp4] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp7], %[ftmp9] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "mov.d %[ftmp3], %[ftmp13] \n\t" + "psrah %[ftmp0], %[ftmp13], %[ftmp8] \n\t" + "psrah %[ftmp7], 
%[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "ldc1 %[ftmp6], 0x08($29) \n\t" + "dmtc1 %[tmp6], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "sdc1 %[ftmp6], 0x08($29) \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "sdc1 %[ftmp7], 0x18($29) \n\t" + "dmfc1 %[tmp2], %[ftmp0] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + "ldc1 %[ftmp2], 0x08($29) \n\t" + "ldc1 %[ftmp5], 0x18($29) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "dmtc1 %[tmp2], %[ftmp1] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "uld %[low32], 0x00(%[addr0]) 
\n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t" + PTR_ADDIU "$29, $29, 0x20 \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), + [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), + [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]), + [addr0]"=&r"(addr[0]), + [low32]"=&r"(low32) + : [dst]"r"(dst), [block]"r"(block), + [stride]"r"((mips_reg)stride) + : "$29","memory" + ); + + memset(block, 0, 128); +} + +void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + double ftmp[6]; + uint64_t low32; + + block[0] = 0; + + __asm__ volatile ( + "mtc1 %[dc], %[ftmp5] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[dst0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[dst1]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[dst2]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x00(%[dst3]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst0]) \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst1]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst1]) \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst2]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst2]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst3]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [low32]"=&r"(low32) + : [dst0]"r"(dst), [dst1]"r"(dst+stride), + [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride), + [dc]"r"(dc) + : "memory" + ); +} + +void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + double ftmp[10]; + + block[0] = 0; + + __asm__ volatile ( + "mtc1 %[dc], %[ftmp5] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[dst0]) \n\t" + "ldc1 %[ftmp2], 0x00(%[dst1]) \n\t" + "ldc1 %[ftmp3], 0x00(%[dst2]) \n\t" + 
"ldc1 %[ftmp4], 0x00(%[dst3]) \n\t" + "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst0]) \n\t" + "sdc1 %[ftmp2], 0x00(%[dst1]) \n\t" + "sdc1 %[ftmp3], 0x00(%[dst2]) \n\t" + "sdc1 %[ftmp4], 0x00(%[dst3]) \n\t" + + "ldc1 %[ftmp1], 0x00(%[dst4]) \n\t" + "ldc1 %[ftmp2], 0x00(%[dst5]) \n\t" + "ldc1 %[ftmp3], 0x00(%[dst6]) \n\t" + "ldc1 %[ftmp4], 0x00(%[dst7]) \n\t" + "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst4]) \n\t" + "sdc1 %[ftmp2], 0x00(%[dst5]) \n\t" + "sdc1 %[ftmp3], 0x00(%[dst6]) \n\t" + "sdc1 %[ftmp4], 0x00(%[dst7]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]) + : [dst0]"r"(dst), [dst1]"r"(dst+stride), + [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride), + [dst4]"r"(dst+4*stride), [dst5]"r"(dst+5*stride), + [dst6]"r"(dst+6*stride), [dst7]"r"(dst+7*stride), + [dc]"r"(dc) + : "memory" + ); +} + +void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]) +{ + int i; + for(i=0; i<16; i++){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && ((int16_t*)block)[i*16]) + ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16, + stride); + else + ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, + stride); + } + } +} + +void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]) +{ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride); + else if(((int16_t*)block)[i*16]) + 
ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16, + stride); + } +} + +void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]) +{ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && ((int16_t*)block)[i*16]) + ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i], + block + i*16, stride); + else + ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16, + stride); + } + } +} + +void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]) +{ + int i, j; + for(j=1; j<3; j++){ + for(i=j*16; i<j*16+4; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i], + block + i*16, stride); + else if(((int16_t*)block)[i*16]) + ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i], + block + i*16, stride); + } + } +} + +void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[15*8]) +{ + int i, j; + + for(j=1; j<3; j++){ + for(i=j*16; i<j*16+4; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i], + block + i*16, stride); + else if(((int16_t*)block)[i*16]) + ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i], + block + i*16, stride); + } + } + + for(j=1; j<3; j++){ + for(i=j*16+4; i<j*16+8; i++){ + if(nnzc[ scan8[i+4] ]) + ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4], + block + i*16, stride); + else if(((int16_t*)block)[i*16]) + ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4], + block + i*16, stride); + } + } +} + +void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input, + int qmul) +{ + double ftmp[10]; + uint64_t tmp[2]; + + __asm__ volatile ( + ".set noreorder \n\t" + "dli %[tmp0], 0x08 \n\t" + "ldc1 %[ftmp3], 0x18(%[input]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "ldc1 %[ftmp2], 0x10(%[input]) \n\t" + "dli %[tmp0], 0x20 \n\t" + "ldc1 %[ftmp1], 0x08(%[input]) \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp0], 0x00(%[input]) \n\t" + "mov.d %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "mov.d %[ftmp4], %[ftmp1] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "mov.d %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "mov.d %[ftmp4], %[ftmp3] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp0] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "mov.d %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "mov.d %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "mov.d %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "mov.d %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "mov.d %[ftmp1], %[ftmp4] \n\t" + "daddi %[tmp0], %[qmul], -0x7fff \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "bgtz %[tmp0], 1f \n\t" + "psubh 
%[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "ori %[tmp0], $0, 0x80 \n\t" + "dsll %[tmp0], %[tmp0], 0x10 \n\t" + "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t" + "daddu %[qmul], %[qmul], %[tmp0] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t" + "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t" + "mtc1 %[qmul], %[ftmp7] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psraw %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "dmfc1 %[tmp1], %[ftmp0] \n\t" + "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp0] \n\t" + "sh %[tmp1], 0x00(%[output]) \n\t" + "sh %[input], 0x80(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x20(%[output]) \n\t" + "sh %[input], 0xa0(%[output]) \n\t" + "dmfc1 %[tmp1], %[ftmp2] \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp2] \n\t" + "sh %[tmp1], 0x40(%[output]) \n\t" + "sh %[input], 0xc0(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x60(%[output]) \n\t" + "sh %[input], 0xe0(%[output]) \n\t" + "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t" + "mtc1 %[qmul], %[ftmp7] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "dmfc1 %[tmp1], %[ftmp3] \n\t" + "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp3] \n\t" + "sh %[tmp1], 0x100(%[output]) \n\t" + "sh %[input], 0x180(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x120(%[output]) \n\t" + "sh %[input], 0x1a0(%[output]) \n\t" + "dmfc1 %[tmp1], %[ftmp4] \n\t" + "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp4] \n\t" + "sh %[tmp1], 0x140(%[output]) \n\t" + "sh %[input], 0x1c0(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x160(%[output]) \n\t" + "j 2f \n\t" + "sh %[input], 0x1e0(%[output]) \n\t" + "1: \n\t" + "ori %[tmp0], $0, 0x1f \n\t" + "clz %[tmp1], %[qmul] \n\t" + "ori %[input], $0, 0x07 \n\t" + "dsubu %[tmp1], %[tmp0], %[tmp1] \n\t" + "ori %[tmp0], $0, 0x80 \n\t" + "dsll %[tmp0], %[tmp0], 0x10 \n\t" + "daddu %[qmul], %[qmul], %[tmp0] \n\t" + "dsubu %[tmp0], %[tmp1], %[input] \n\t" + "movn %[tmp1], %[input], %[tmp0] \n\t" + PTR_ADDIU "%[input], %[input], 0x01 \n\t" + "andi %[tmp0], %[tmp1], 0xff \n\t" + "srlv %[qmul], %[qmul], %[tmp0] \n\t" + PTR_SUBU "%[input], %[input], %[tmp1] \n\t" + "mtc1 %[input], %[ftmp6] \n\t" + "punpckhhw %[ftmp1], 
%[ftmp0], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t" + "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t" + "mtc1 %[qmul], %[ftmp7] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psraw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "dmfc1 %[tmp1], %[ftmp0] \n\t" + "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "sh %[tmp1], 0x00(%[output]) \n\t" + "mfc1 %[input], %[ftmp0] \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + "sh %[input], 0x80(%[output]) \n\t" + "sh %[tmp1], 0x20(%[output]) \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "dmfc1 %[tmp1], %[ftmp2] \n\t" + "sh %[input], 0xa0(%[output]) \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "sh %[tmp1], 0x40(%[output]) \n\t" + "mfc1 %[input], %[ftmp2] \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + "sh %[input], 0xc0(%[output]) \n\t" + "sh %[tmp1], 0x60(%[output]) \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[input], 0xe0(%[output]) \n\t" + "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t" + "mtc1 %[qmul], %[ftmp7] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "dmfc1 %[tmp1], %[ftmp3] \n\t" + "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp3] \n\t" + "sh %[tmp1], 0x100(%[output]) \n\t" + "sh %[input], 0x180(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x120(%[output]) \n\t" + "sh %[input], 0x1a0(%[output]) \n\t" + "dmfc1 %[tmp1], %[ftmp4] \n\t" + "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "mfc1 %[input], %[ftmp4] \n\t" + "sh %[tmp1], 0x140(%[output]) \n\t" + "sh %[input], 0x1c0(%[output]) \n\t" + "dsrl %[tmp1], %[tmp1], 0x10 \n\t" + PTR_SRL "%[input], %[input], 0x10 \n\t" + "sh %[tmp1], 0x160(%[output]) \n\t" + "sh %[input], 0x1e0(%[output]) \n\t" + "2: \n\t" + ".set reorder \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [output]"+&r"(output), [input]"+&r"(input), + [qmul]"+&r"(qmul) + : [ff_pw_1]"f"(ff_pw_1) + : "memory" + ); +} + +void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul) +{ + int temp[8]; + int t[8]; + + temp[0] = block[0] + block[16]; + temp[1] = block[0] - block[16]; + temp[2] = block[32] + block[48]; + temp[3] = block[32] - block[48]; + temp[4] = block[64] + block[80]; + temp[5] = block[64] - block[80]; + 
temp[6] = block[96] + block[112]; + temp[7] = block[96] - block[112]; + + t[0] = temp[0] + temp[4] + temp[2] + temp[6]; + t[1] = temp[0] - temp[4] + temp[2] - temp[6]; + t[2] = temp[0] - temp[4] - temp[2] + temp[6]; + t[3] = temp[0] + temp[4] - temp[2] - temp[6]; + t[4] = temp[1] + temp[5] + temp[3] + temp[7]; + t[5] = temp[1] - temp[5] + temp[3] - temp[7]; + t[6] = temp[1] - temp[5] - temp[3] + temp[7]; + t[7] = temp[1] + temp[5] - temp[3] - temp[7]; + + block[ 0]= (t[0]*qmul + 128) >> 8; + block[ 32]= (t[1]*qmul + 128) >> 8; + block[ 64]= (t[2]*qmul + 128) >> 8; + block[ 96]= (t[3]*qmul + 128) >> 8; + block[ 16]= (t[4]*qmul + 128) >> 8; + block[ 48]= (t[5]*qmul + 128) >> 8; + block[ 80]= (t[6]*qmul + 128) >> 8; + block[112]= (t[7]*qmul + 128) >> 8; +} + +void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul) +{ + int a,b,c,d; + + d = block[0] - block[16]; + a = block[0] + block[16]; + b = block[32] - block[48]; + c = block[32] + block[48]; + block[0] = ((a+c)*qmul) >> 7; + block[16]= ((d+b)*qmul) >> 7; + block[32]= ((a-c)*qmul) >> 7; + block[48]= ((d-b)*qmul) >> 7; +} + +void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset) +{ + int y; + double ftmp[8]; + + offset <<= log2_denom; + + if (log2_denom) + offset += 1 << (log2_denom - 1); + + for (y=0; y<height; y++, block+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[block0]) \n\t" + "ldc1 %[ftmp2], 0x00(%[block1]) \n\t" + "mtc1 %[weight], %[ftmp3] \n\t" + "mtc1 %[offset], %[ftmp4] \n\t" + "mtc1 %[log2_denom], %[ftmp5] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "sdc1 %[ftmp1], 0x00(%[block0]) \n\t" + "sdc1 %[ftmp2], 0x00(%[block1]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]) + : [block0]"r"(block), [block1]"r"(block+8), + [weight]"r"(weight), [offset]"r"(offset), + [log2_denom]"r"(log2_denom) + : "memory" + ); + } +} + +void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_denom, int weightd, int weights, int offset) +{ + int y; + double ftmp[9]; + + offset = ((offset + 1) | 1) << log2_denom; + + for (y=0; y<height; y++, dst+=stride, src+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[src0]) \n\t" + "ldc1 %[ftmp2], 0x00(%[dst0]) \n\t" + "mtc1 %[weights], %[ftmp3] \n\t" + "mtc1 %[weightd], %[ftmp4] \n\t" + "mtc1 %[offset], %[ftmp5] \n\t" + "mtc1 %[log2_denom], %[ftmp6] \n\t" + "pshufh %[ftmp3], %[ftmp3], 
%[ftmp0] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst0]) \n\t" + "ldc1 %[ftmp1], 0x00(%[src1]) \n\t" + "ldc1 %[ftmp2], 0x00(%[dst1]) \n\t" + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst1]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]) + : [dst0]"r"(dst), [dst1]"r"(dst+8), + [src0]"r"(src), [src1]"r"(src+8), + [weights]"r"(weights), [weightd]"r"(weightd), + [offset]"r"(offset), [log2_denom]"r"(log2_denom+1) + : "memory" + ); + } +} + +void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset) +{ + int y; + double ftmp[6]; + + offset <<= log2_denom; + + if (log2_denom) + offset += 1 << (log2_denom - 1); + + for (y=0; y<height; y++, block+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[block]) \n\t" + "mtc1 %[weight], %[ftmp2] \n\t" + "mtc1 %[offset], %[ftmp3] \n\t" + "mtc1 %[log2_denom], %[ftmp5] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "sdc1 %[ftmp1], 0x00(%[block]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]) + : [block]"r"(block), [weight]"r"(weight), + [offset]"r"(offset), [log2_denom]"r"(log2_denom) + : "memory" + ); + } +} + +void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_denom, int weightd, int weights, int offset) +{ + int y; + double ftmp[9]; + + offset = ((offset + 1) | 1) << 
log2_denom; + + for (y=0; y<height; y++, dst+=stride, src+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "ldc1 %[ftmp1], 0x00(%[src]) \n\t" + "ldc1 %[ftmp2], 0x00(%[dst]) \n\t" + "mtc1 %[weights], %[ftmp3] \n\t" + "mtc1 %[weightd], %[ftmp4] \n\t" + "mtc1 %[offset], %[ftmp5] \n\t" + "mtc1 %[log2_denom], %[ftmp6] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]) + : [dst]"r"(dst), [src]"r"(src), + [weights]"r"(weights), [weightd]"r"(weightd), + [offset]"r"(offset), [log2_denom]"r"(log2_denom+1) + : "memory" + ); + } +} + +void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset) +{ + int y; + double ftmp[5]; + uint64_t low32; + + offset <<= log2_denom; + + if (log2_denom) + offset += 1 << (log2_denom - 1); + + for (y=0; y<height; y++, block+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[block]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "mtc1 %[weight], %[ftmp2] \n\t" + "mtc1 %[offset], %[ftmp3] \n\t" + "mtc1 %[log2_denom], %[ftmp4] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[block]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[block]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [low32]"=&r"(low32) + : [block]"r"(block), [weight]"r"(weight), + [offset]"r"(offset), [log2_denom]"r"(log2_denom) + : "memory" + ); + } +} + +void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_denom, int weightd, int weights, int offset) +{ + int y; + double ftmp[7]; + uint64_t low32; + + offset = ((offset + 1) | 1) << log2_denom; + + for (y=0; y<height; y++, dst+=stride, src+=stride) { + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "mtc1 %[weight], %[ftmp3] \n\t" + "mtc1 %[weightd], %[ftmp4] \n\t" + "mtc1 %[offset], %[ftmp5] \n\t" + "mtc1 %[log2_denom], %[ftmp6] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "pshufh %[ftmp5], %[ftmp5], 
%[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), + [low32]"=&r"(low32) + : [dst]"r"(dst), [src]"r"(src), + [weight]"r"(weights), [weightd]"r"(weightd), + [offset]"r"(offset), [log2_denom]"r"(log2_denom+1) + : "memory" + ); + } +} + +void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0) +{ + double ftmp[12]; + mips_reg addr[2]; + uint64_t low32; + + __asm__ volatile ( + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + PTR_ADDU "%[addr1], %[stride], %[addr0] \n\t" + "addi %[alpha], %[alpha], -0x01 \n\t" + PTR_SUBU "%[addr1], $0, %[addr1] \n\t" + "addi %[beta], %[beta], -0x01 \n\t" + PTR_ADDU "%[addr1], %[addr1], %[pix] \n\t" + "ldc1 %[ftmp3], 0x00(%[pix]) \n\t" + "gsldxc1 %[ftmp1], 0x00(%[addr1], %[stride]) \n\t" + "gsldxc1 %[ftmp2], 0x00(%[addr1], %[addr0]) \n\t" + "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t" + "mtc1 %[alpha], %[ftmp5] \n\t" + "mtc1 %[beta], %[ftmp6] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t" + "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t" + "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "uld %[low32], 0x00(%[tc0]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp5] \n\t" + "pcmpgtb %[ftmp5], %[ftmp9], %[ftmp4] \n\t" + "ldc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "and %[ftmp10], %[ftmp5], %[ftmp8] \n\t" + "psubusb %[ftmp8], %[ftmp4], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "and %[ftmp5], %[ftmp10], %[ftmp9] \n\t" + "psubb %[ftmp8], %[ftmp5], %[ftmp7] \n\t" + "and %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "pavgb %[ftmp5], %[ftmp2], %[ftmp3] \n\t" + "ldc1 %[ftmp11], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp5], %[ftmp1], %[ftmp7] \n\t" + "paddusb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmaxub %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "pminub 
%[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "gssdxc1 %[ftmp4], 0x00(%[addr1], %[stride]) \n\t" + "gsldxc1 %[ftmp5], 0x00(%[pix], %[addr0]) \n\t" + "psubusb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp5] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "and %[ftmp6], %[ftmp9], %[ftmp7] \n\t" + "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t" + "pavgb %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "gsldxc1 %[ftmp11], 0x00(%[pix], %[addr0]) \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "and %[ftmp7], %[ftmp7], %[ff_pb_1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp6] \n\t" + "paddusb %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "pmaxub %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pminub %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "gssdxc1 %[ftmp5], 0x00(%[pix], %[stride]) \n\t" + "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t" + "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "gssdxc1 %[ftmp2], 0x00(%[addr1], %[addr0]) \n\t" + "sdc1 %[ftmp3], 0x00(%[pix]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [low32]"=&r"(low32) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [alpha]"r"((mips_reg)alpha), [beta]"r"((mips_reg)beta), + [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1), + [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1) + : "memory" + ); +} + +static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta) +{ + DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]); + double ftmp[16]; + uint64_t tmp[1]; + mips_reg addr[3]; + +__asm__ volatile ( +"ori %[tmp0], $0, 0x01 \n\t" +"xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +"mtc1 %[tmp0], %[ftmp9] \n\t" +PTR_SLL "%[addr0], %[stride], 0x02 \n\t" +PTR_ADDU "%[addr2], %[stride], %[stride] \n\t" +PTR_ADDIU "%[alpha], %[alpha], -0x01 \n\t" +PTR_SLL "%[ftmp11], %[ftmp9], %[ftmp9] \n\t" +"bltz %[alpha], 1f \n\t" +PTR_ADDU "%[addr1], %[addr2], %[stride] \n\t" +PTR_ADDIU "%[beta], %[beta], -0x01 \n\t" +"bltz %[beta], 1f \n\t" +PTR_SUBU "%[addr0], $0, %[addr0] \n\t" +PTR_ADDU "%[addr0], %[addr0], %[pix] \n\t" +"ldc1 %[ftmp3], 0x00(%[pix]) \n\t" +"gsldxc1 %[ftmp1], 0x00(%[addr0], %[addr2]) \n\t" +"gsldxc1 %[ftmp2], 0x00(%[addr0], %[addr1]) \n\t" +"gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t" +"mtc1 %[alpha], %[ftmp5] \n\t" +"mtc1 %[beta], %[ftmp6] \n\t" +"pshufh %[ftmp5], %[ftmp5], 
%[ftmp0] \n\t" +"pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" +"packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" +"psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t" +"psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t" +"packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" +"or %[ftmp8], %[ftmp8], %[ftmp7] \n\t" +"sdc1 %[ftmp5], 0x10+%[stack] \n\t" +"psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t" +"psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t" +"psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t" +"or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" +"psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t" +"psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t" +"or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" +"ldc1 %[ftmp5], 0x10+%[stack] \n\t" +"pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" +"ldc1 %[ftmp10], %[ff_pb_1] \n\t" +"sdc1 %[ftmp8], 0x20+%[stack] \n\t" +"pavgb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" +"psubusb %[ftmp8], %[ftmp3], %[ftmp2] \n\t" +"pavgb %[ftmp5], %[ftmp5], %[ftmp10] \n\t" +"psubusb %[ftmp7], %[ftmp2], %[ftmp3] \n\t" +"psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t" +"psubusb %[ftmp7], %[ftmp7], %[ftmp5] \n\t" +"ldc1 %[ftmp15], 0x20+%[stack] \n\t" +"pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp15] \n\t" +"gsldxc1 %[ftmp15], 0x00(%[addr0], %[stride]) \n\t" +"psubusb %[ftmp8], %[ftmp15], %[ftmp2] \n\t" +"psubusb %[ftmp5], %[ftmp2], %[ftmp15] \n\t" +"psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t" +"psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" +"and %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"gsldxc1 %[ftmp14], 0x00(%[pix], %[addr2]) \n\t" +"sdc1 %[ftmp5], 0x30+%[stack] \n\t" +"psubusb %[ftmp8], %[ftmp14], %[ftmp3] \n\t" +"psubusb %[ftmp5], %[ftmp3], %[ftmp14] \n\t" +"psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t" +"psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" +"and %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"sdc1 %[ftmp5], 0x40+%[stack] \n\t" +"pavgb %[ftmp5], %[ftmp15], %[ftmp1] \n\t" +"pavgb %[ftmp6], %[ftmp2], %[ftmp3] \n\t" +"pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"sdc1 %[ftmp6], 0x10+%[stack] \n\t" +"paddb %[ftmp7], %[ftmp15], %[ftmp1] \n\t" +"paddb %[ftmp8], %[ftmp2], %[ftmp3] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" +"mov.d %[ftmp8], %[ftmp7] \n\t" +"sdc1 %[ftmp7], 0x00+%[stack] \n\t" +"psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"pavgb %[ftmp6], %[ftmp15], %[ftmp4] \n\t" +"psubb %[ftmp7], %[ftmp15], %[ftmp4] \n\t" +"paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t" +"psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"ldc1 %[ftmp13], 0x10+%[stack] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" +"psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp13] \n\t" +"pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" +"xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t" +"and %[ftmp8], %[ftmp8], %[ftmp10] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t" +"xor %[ftmp8], %[ftmp2], %[ftmp4] \n\t" +"pavgb %[ftmp7], %[ftmp2], %[ftmp4] \n\t" +"and %[ftmp8], %[ftmp8], %[ftmp10] \n\t" +"psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" +"ldc1 %[ftmp13], 0x30+%[stack] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" +"ldc1 %[ftmp12], 0x20+%[stack] 
\n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp2] \n\t" +"and %[ftmp6], %[ftmp6], %[ftmp13] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp12] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp2] \n\t" +"gssdxc1 %[ftmp6], 0x00(%[addr0], %[addr1]) \n\t" +"ldc1 %[ftmp6], 0x00(%[addr0]) \n\t" +"paddb %[ftmp7], %[ftmp15], %[ftmp6] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp15] \n\t" +"ldc1 %[ftmp12], 0x00+%[stack] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t" +"psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"ldc1 %[ftmp12], 0x30+%[stack] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t" +"and %[ftmp5], %[ftmp5], %[ftmp12] \n\t" +"and %[ftmp6], %[ftmp6], %[ftmp12] \n\t" +"xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t" +"gssdxc1 %[ftmp5], 0x00(%[addr0], %[addr2]) \n\t" +"gssdxc1 %[ftmp6], 0x00(%[addr0], %[stride]) \n\t" +"pavgb %[ftmp5], %[ftmp14], %[ftmp4] \n\t" +"pavgb %[ftmp6], %[ftmp3], %[ftmp2] \n\t" +"pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" +"sdc1 %[ftmp6], 0x10+%[stack] \n\t" +"paddb %[ftmp7], %[ftmp14], %[ftmp4] \n\t" +"paddb %[ftmp8], %[ftmp3], %[ftmp2] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" +"mov.d %[ftmp8], %[ftmp7] \n\t" +"sdc1 %[ftmp7], 0x00+%[stack] \n\t" +"psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" +"pavgb %[ftmp6], %[ftmp14], %[ftmp1] \n\t" +"paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t" +"psubb %[ftmp7], %[ftmp14], %[ftmp1] \n\t" +"psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"ldc1 %[ftmp12], 0x10+%[stack] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" +"psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" +"pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" +"xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t" +"and %[ftmp8], %[ftmp8], %[ftmp10] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t" +"xor %[ftmp8], %[ftmp3], %[ftmp1] \n\t" +"pavgb %[ftmp7], %[ftmp3], %[ftmp1] \n\t" +"and %[ftmp8], %[ftmp8], %[ftmp10] \n\t" +"ldc1 %[ftmp12], 0x40+%[stack] \n\t" +"psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" +"ldc1 %[ftmp13], 0x20+%[stack] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp4] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp3] \n\t" +"and %[ftmp6], %[ftmp6], %[ftmp12] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp13] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp3] \n\t" +"sdc1 %[ftmp6], 0x00(%[pix]) \n\t" +"gsldxc1 %[ftmp6], 0x00(%[pix], %[addr1]) \n\t" +"paddb %[ftmp7], %[ftmp14], %[ftmp6] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp14] \n\t" +"ldc1 %[ftmp12], 0x00+%[stack] \n\t" +"pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" +"paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t" +"psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" +"pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" +"xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t" +"and %[ftmp7], %[ftmp7], %[ftmp10] \n\t" +"ldc1 %[ftmp12], 0x40+%[stack] \n\t" +"psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" +"xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t" +"xor 
%[ftmp6], %[ftmp6], %[ftmp14] \n\t" +"and %[ftmp5], %[ftmp5], %[ftmp12] \n\t" +"and %[ftmp6], %[ftmp6], %[ftmp12] \n\t" +"xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t" +"xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t" +"gssdxc1 %[ftmp5], 0x00(%[pix], %[stride]) \n\t" +"gssdxc1 %[ftmp6], 0x00(%[pix], %[addr2]) \n\t" +"1: \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), + [alpha]"+&r"(alpha), [beta]"+&r"(beta) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [stack]"m"(stack[0]), [ff_pb_1]"m"(ff_pb_1) +: "memory" +); +} + +void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0) +{ + double ftmp[9]; + mips_reg addr[1]; + uint64_t low32; + + __asm__ volatile ( + "addi %[alpha], %[alpha], -0x01 \n\t" + "addi %[beta], %[beta], -0x01 \n\t" + "or %[addr0], $0, %[pix] \n\t" + PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t" + PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t" + "ldc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t" + "ldc1 %[ftmp3], 0x00(%[pix]) \n\t" + "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[alpha], %[ftmp5] \n\t" + "mtc1 %[beta], %[ftmp6] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t" + "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[tc0]) \n\t" + "mtc1 %[low32], %[ftmp7] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "and %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t" + "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t" 
+ "sdc1 %[ftmp3], 0x00(%[pix]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]), + [low32]"=&r"(low32) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [alpha]"r"(alpha), [beta]"r"(beta), + [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1), + [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1) + : "memory" + ); +} + +void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta) +{ + double ftmp[9]; + mips_reg addr[1]; + + __asm__ volatile ( + "addi %[alpha], %[alpha], -0x01 \n\t" + "addi %[beta], %[beta], -0x01 \n\t" + "or %[addr0], $0, %[pix] \n\t" + PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t" + PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t" + "ldc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t" + "ldc1 %[ftmp3], 0x00(%[pix]) \n\t" + "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[alpha], %[ftmp5] \n\t" + "mtc1 %[beta], %[ftmp6] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t" + "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t" + "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "mov.d %[ftmp6], %[ftmp2] \n\t" + "mov.d %[ftmp7], %[ftmp3] \n\t" + "xor %[ftmp5], %[ftmp2], %[ftmp4] \n\t" + "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "psubusb %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "xor %[ftmp5], %[ftmp3], %[ftmp1] \n\t" + "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t" + "pavgb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "pavgb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psubb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "and %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "paddb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t" + "sdc1 %[ftmp3], 0x00(%[pix]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [alpha]"r"(alpha), [beta]"r"(beta), + [ff_pb_1]"f"(ff_pb_1) + : "memory" + ); +} + +void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0) +{ + double ftmp[11]; + mips_reg addr[6]; + uint64_t low32; + + __asm__ volatile ( + "addi %[alpha], %[alpha], -0x01 \n\t" + "addi %[beta], %[beta], -0x01 \n\t" + PTR_ADDU 
"%[addr0], %[stride], %[stride] \n\t" + PTR_ADDI "%[pix], %[pix], -0x02 \n\t" + PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t" + "or %[addr5], $0, %[pix] \n\t" + PTR_ADDU "%[pix], %[pix], %[addr1] \n\t" + "uld %[low32], 0x00(%[addr5]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[pix]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], %[pix], %[stride] \n\t" + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "mov.d %[ftmp6], %[ftmp4] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "mov.d %[ftmp9], %[ftmp0] \n\t" + "mov.d %[ftmp10], %[ftmp3] \n\t" + + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "mtc1 %[alpha], %[ftmp4] \n\t" + "mtc1 %[beta], %[ftmp5] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t" + "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t" + "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t" + "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "uld %[low32], 0x00(%[tc0]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "and %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "xor %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t" + "pavgb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "pavgb %[ftmp3], %[ftmp3], %[ff_pb_3] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "pavgb %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp6], %[ff_pb_A1], %[ftmp3] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ff_pb_A1] \n\t" + "pminub %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + "pminub %[ftmp3], %[ftmp3], %[ftmp7] 
\n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psubusb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddusb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + + "punpckhwd %[ftmp4], %[ftmp9], %[ftmp9] \n\t" + "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t" + "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t" + "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t" + PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp3], %[ftmp10], %[ftmp10] \n\t" + "gsswlc1 %[ftmp0], 0x03(%[pix]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[pix]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], %[pix], %[stride] \n\t" + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr4]) \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t" + "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr4]) \n\t" + "gsswlc1 %[ftmp9], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp9], 0x00(%[addr3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [pix]"+&r"(pix), + [low32]"=&r"(low32) + : [alpha]"r"(alpha), [beta]"r"(beta), + [stride]"r"((mips_reg)stride), [tc0]"r"(tc0), + [ff_pb_1]"f"(ff_pb_1), [ff_pb_3]"f"(ff_pb_3), + [ff_pb_A1]"f"(ff_pb_A1) + : "memory" + ); +} + +void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta) +{ + double ftmp[11]; + mips_reg addr[6]; + uint64_t low32; + + __asm__ volatile ( + "addi %[alpha], %[alpha], -0x01 \n\t" + "addi %[beta], %[beta], -0x01 \n\t" + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + PTR_ADDI "%[pix], %[pix], -0x02 \n\t" + PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t" + "or %[addr5], $0, %[pix] \n\t" + PTR_ADDU "%[pix], %[pix], %[addr1] \n\t" + "uld %[low32], 0x00(%[addr5]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[pix]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], 
%[pix], %[stride] \n\t" + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t" + "uld %[low32], 0x00(%[addr3]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t" + "uld %[low32], 0x00(%[addr4]) \n\t" + "mtc1 %[low32], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "mov.d %[ftmp6], %[ftmp4] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "mtc1 %[alpha], %[ftmp4] \n\t" + "mtc1 %[beta], %[ftmp5] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t" + "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t" + "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t" + "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t" + "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "mov.d %[ftmp5], %[ftmp1] \n\t" + "mov.d %[ftmp6], %[ftmp2] \n\t" + "xor %[ftmp4], %[ftmp1], %[ftmp3] \n\t" + "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "xor %[ftmp4], %[ftmp2], %[ftmp0] \n\t" + "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "psubusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "psubb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "and %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + + "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t" + "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t" + "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t" + PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" 
+ "gsswlc1 %[ftmp0], 0x03(%[pix]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[pix]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], %[pix], %[stride] \n\t" + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t" + PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr4]) \n\t" + PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t" + "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr4]) \n\t" + "gsswlc1 %[ftmp9], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp9], 0x00(%[addr3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [pix]"+&r"(pix), + [low32]"=&r"(low32) + : [alpha]"r"(alpha), [beta]"r"(beta), + [stride]"r"((mips_reg)stride), [ff_pb_1]"f"(ff_pb_1) + : "memory" + ); +} + +void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0) +{ + if ((tc0[0] & tc0[1]) >= 0) + ff_deblock_v8_luma_8_mmi(pix + 0, stride, alpha, beta, tc0); + if ((tc0[2] & tc0[3]) >= 0) + ff_deblock_v8_luma_8_mmi(pix + 8, stride, alpha, beta, tc0 + 2); +} + +void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta) +{ + deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta); + deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta); +} + +void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, + int8_t *tc0) +{ + uint64_t stack[0xd]; + double ftmp[9]; + mips_reg addr[8]; + + __asm__ volatile ( + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + PTR_ADDI "%[addr1], %[pix], -0x4 \n\t" + PTR_ADDU "%[addr2], %[stride], %[addr0] \n\t" + "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t" + PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr3]) \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[addr3]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr3]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[addr3]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr3]) \n\t" + PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t" + "sdc1 %[ftmp1], 
0x10(%[stack]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr3]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr3]) \n\t" + PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "ldc1 %[ftmp8], 0x10(%[stack]) \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "sdc1 %[ftmp0], 0x00(%[stack]) \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t" + "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "sdc1 %[ftmp1], 0x10(%[stack]) \n\t" + "sdc1 %[ftmp3], 0x20(%[stack]) \n\t" + "sdc1 %[ftmp7], 0x30(%[stack]) \n\t" + "sdc1 %[ftmp5], 0x40(%[stack]) \n\t" + "sdc1 %[ftmp6], 0x50(%[stack]) \n\t" + PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t" + "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr3]) \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[addr3]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr3]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[addr3]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr3]) \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t" + "sdc1 %[ftmp1], 0x18(%[stack]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr3]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr3]) \n\t" + "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "ldc1 %[ftmp8], 0x18(%[stack]) \n\t" + "sdc1 %[ftmp0], 0x08(%[stack]) \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t" + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t" + "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "sdc1 %[ftmp1], 0x18(%[stack]) \n\t" + "sdc1 %[ftmp3], 0x28(%[stack]) \n\t" + "sdc1 %[ftmp7], 0x38(%[stack]) \n\t" + "sdc1 %[ftmp5], 
0x48(%[stack]) \n\t" + "sdc1 %[ftmp6], 0x58(%[stack]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7]) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [stack]"r"(stack) + : "memory" + ); + + ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0); + + __asm__ volatile ( + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + PTR_ADDI "%[addr1], %[pix], -0x02 \n\t" + PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t" + PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t" + PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t" + "ldc1 %[ftmp0], 0x10(%[stack]) \n\t" + "ldc1 %[ftmp1], 0x20(%[stack]) \n\t" + "ldc1 %[ftmp2], 0x30(%[stack]) \n\t" + "ldc1 %[ftmp3], 0x40(%[stack]) \n\t" + "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t" + "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t" + "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr5]) \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t" + "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t" + "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t" + "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr3]) \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t" + "ldc1 %[ftmp0], 0x18(%[stack]) \n\t" + "ldc1 %[ftmp1], 0x28(%[stack]) \n\t" + "ldc1 %[ftmp2], 0x38(%[stack]) \n\t" + "ldc1 %[ftmp3], 0x48(%[stack]) \n\t" + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t" + PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t" + "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t" + "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t" + "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + 
"gsswlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr5]) \n\t" + "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t" + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t" + "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr5]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t" + "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[addr3]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[addr3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7]) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [stack]"r"(stack) + : "memory" + ); +} + +void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, + int beta) +{ + uint64_t ptmp[0x11]; + uint64_t pdat[4]; + double ftmp[9]; + mips_reg addr[7]; + + __asm__ volatile ( + PTR_ADDU "%[addr0], %[stride], %[stride] \n\t" + PTR_ADDI "%[addr1], %[pix], -0x04 \n\t" + PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr3], %[addr0], %[addr0] \n\t" + PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t" + "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr6]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr6]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr5]) \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] 
\n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "sdc1 %[ftmp3], 0x00(%[ptmp]) \n\t" + "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "sdc1 %[ftmp2], 0x20(%[ptmp]) \n\t" + "ldc1 %[ftmp2], 0x00(%[ptmp]) \n\t" + "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "sdc1 %[ftmp0], 0x00(%[ptmp]) \n\t" + "sdc1 %[ftmp5], 0x10(%[ptmp]) \n\t" + "sdc1 %[ftmp7], 0x40(%[ptmp]) \n\t" + "sdc1 %[ftmp4], 0x50(%[ptmp]) \n\t" + "ldc1 %[ftmp8], 0x20(%[ptmp]) \n\t" + "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t" + "sdc1 %[ftmp3], 0x20(%[ptmp]) \n\t" + "sdc1 %[ftmp0], 0x30(%[ptmp]) \n\t" + "sdc1 %[ftmp6], 0x60(%[ptmp]) \n\t" + "sdc1 %[ftmp5], 0x70(%[ptmp]) \n\t" + PTR_ADDU "%[addr1], %[addr1], %[addr5] \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr5] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t" + "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr6]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr6]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr5]) \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "sdc1 %[ftmp3], 0x08(%[ptmp]) \n\t" + "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "sdc1 %[ftmp2], 0x28(%[ptmp]) \n\t" + "ldc1 %[ftmp2], 0x08(%[ptmp]) \n\t" + "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "sdc1 %[ftmp0], 0x08(%[ptmp]) \n\t" + "sdc1 %[ftmp5], 0x18(%[ptmp]) \n\t" + "sdc1 %[ftmp7], 
0x48(%[ptmp]) \n\t" + "sdc1 %[ftmp4], 0x58(%[ptmp]) \n\t" + "ldc1 %[ftmp8], 0x28(%[ptmp]) \n\t" + "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "sdc1 %[ftmp3], 0x28(%[ptmp]) \n\t" + "sdc1 %[ftmp0], 0x38(%[ptmp]) \n\t" + "sdc1 %[ftmp6], 0x68(%[ptmp]) \n\t" + "sdc1 %[ftmp5], 0x78(%[ptmp]) \n\t" + PTR_S "%[addr1], 0x00(%[pdat]) \n\t" + PTR_S "%[addr2], 0x08(%[pdat]) \n\t" + PTR_S "%[addr0], 0x10(%[pdat]) \n\t" + PTR_S "%[addr3], 0x18(%[pdat]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [addr6]"=&r"(addr[6]) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [ptmp]"r"(ptmp), [pdat]"r"(pdat) + : "memory" + ); + + ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta); + + __asm__ volatile ( + PTR_L "%[addr1], 0x00(%[pdat]) \n\t" + PTR_L "%[addr2], 0x08(%[pdat]) \n\t" + PTR_L "%[addr0], 0x10(%[pdat]) \n\t" + PTR_L "%[addr3], 0x18(%[pdat]) \n\t" + PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t" + "ldc1 %[ftmp0], 0x08(%[ptmp]) \n\t" + "ldc1 %[ftmp1], 0x18(%[ptmp]) \n\t" + "ldc1 %[ftmp2], 0x28(%[ptmp]) \n\t" + "ldc1 %[ftmp3], 0x38(%[ptmp]) \n\t" + "ldc1 %[ftmp4], 0x48(%[ptmp]) \n\t" + "ldc1 %[ftmp5], 0x58(%[ptmp]) \n\t" + "ldc1 %[ftmp6], 0x68(%[ptmp]) \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "ldc1 %[ftmp8], 0x78(%[ptmp]) \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "gssdlc1 %[ftmp2], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp2], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr6]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr6]) \n\t" + PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr6]) 
\n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[addr4]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[addr6]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr6]) \n\t" + PTR_SUBU "%[addr1], %[addr1], %[addr5] \n\t" + PTR_SUBU "%[addr4], %[addr4], %[addr5] \n\t" + "ldc1 %[ftmp0], 0x00(%[ptmp]) \n\t" + "ldc1 %[ftmp1], 0x10(%[ptmp]) \n\t" + "ldc1 %[ftmp2], 0x20(%[ptmp]) \n\t" + "ldc1 %[ftmp3], 0x30(%[ptmp]) \n\t" + "ldc1 %[ftmp4], 0x40(%[ptmp]) \n\t" + "ldc1 %[ftmp5], 0x50(%[ptmp]) \n\t" + "ldc1 %[ftmp6], 0x60(%[ptmp]) \n\t" + "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "ldc1 %[ftmp8], 0x70(%[ptmp]) \n\t" + "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "gssdlc1 %[ftmp2], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp2], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t" + "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr6]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr6]) \n\t" + PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr5]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr6]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr6]) \n\t" + PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t" + "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t" + "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t" + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[addr5]) \n\t" + PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[addr4]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[addr4]) \n\t" + PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[addr5]) \n\t" + "gssdrc1 %[ftmp6], 
0x00(%[addr5]) \n\t" + "gssdlc1 %[ftmp5], 0x07(%[addr6]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr6]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [addr6]"=&r"(addr[6]) + : [pix]"r"(pix), [stride]"r"((mips_reg)stride), + [ptmp]"r"(ptmp), [pdat]"r"(pdat) + : "memory" + ); +} diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c new file mode 100644 index 0000000000..fce01ac91c --- /dev/null +++ b/libavcodec/mips/h264dsp_msa.c @@ -0,0 +1,2544 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h264dsp_mips.h" + +static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride, + int32_t log2_denom, int32_t src_weight, + int32_t offset_in) +{ + uint32_t data0, data1; + v16u8 zero = { 0 }; + v16u8 src0, src1; + v4i32 res0, res1; + v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset; + v8u16 out0, out1; + + offset_in <<= (log2_denom); + + if (log2_denom) { + offset_in += (1 << (log2_denom - 1)); + } + + wgt = __msa_fill_h(src_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom); + + data0 = LW(data); + data1 = LW(data + stride); + + src0 = (v16u8) __msa_fill_w(data0); + src1 = (v16u8) __msa_fill_w(data1); + + ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1); + MUL2(wgt, vec0, wgt, vec1, temp0, temp1); + ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1); + MAXI_SH2_SH(temp0, temp1, 0); + + out0 = (v8u16) __msa_srl_h(temp0, denom); + out1 = (v8u16) __msa_srl_h(temp1, denom); + + SAT_UH2_UH(out0, out1, 7); + PCKEV_B2_SW(out0, out0, out1, out1, res0, res1); + + data0 = __msa_copy_u_w(res0, 0); + data1 = __msa_copy_u_w(res1, 0); + SW(data0, data); + data += stride; + SW(data1, data); +} + +static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t offset_in) +{ + uint8_t cnt; + uint32_t data0, data1, data2, data3; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 temp0, temp1, temp2, temp3, wgt; + v8i16 denom, offset; + + offset_in <<= (log2_denom); + + if (log2_denom) { + offset_in += (1 << (log2_denom - 1)); + } + + wgt = (v8u16) __msa_fill_h(src_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom); + + for (cnt = height / 4; cnt--;) { + LW4(data, stride, data0, data1, data2, data3); + + src0 = (v16u8) __msa_fill_w(data0); + src1 = (v16u8) __msa_fill_w(data1); + src2 = (v16u8) 
__msa_fill_w(data2); + src3 = (v16u8) __msa_fill_w(data3); + + ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3); + MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3); + ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3); + MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); + SRL_H4_UH(temp0, temp1, temp2, temp3, denom); + SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); + PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride); + data += (4 * stride); + } +} + +static void avc_wgt_4width_msa(uint8_t *data, int32_t stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t offset_in) +{ + if (2 == height) { + avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in); + } else { + avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight, + offset_in); + } +} + +static void avc_wgt_8width_msa(uint8_t *data, int32_t stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t offset_in) +{ + uint8_t cnt; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src0_r, src1_r, src2_r, src3_r; + v8u16 temp0, temp1, temp2, temp3; + v8u16 wgt, denom, offset; + v16i8 out0, out1; + + offset_in <<= (log2_denom); + + if (log2_denom) { + offset_in += (1 << (log2_denom - 1)); + } + + wgt = (v8u16) __msa_fill_h(src_weight); + offset = (v8u16) __msa_fill_h(offset_in); + denom = (v8u16) __msa_fill_h(log2_denom); + + for (cnt = height / 4; cnt--;) { + LD_UB4(data, stride, src0, src1, src2, src3); + ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, + src0_r, src1_r, src2_r, src3_r); + MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, + temp0, temp1, temp2, temp3); + ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3); + MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); + SRL_H4_UH(temp0, temp1, temp2, temp3, denom); + SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); + PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1); + ST8x4_UB(out0, out1, data, stride); + data += (4 * stride); + } +} + +static void avc_wgt_16width_msa(uint8_t *data, int32_t stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t offset_in) +{ + uint8_t cnt; + v16i8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r; + v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8u16 wgt, denom, offset; + + offset_in <<= (log2_denom); + + if (log2_denom) { + offset_in += (1 << (log2_denom - 1)); + } + + wgt = (v8u16) __msa_fill_h(src_weight); + offset = (v8u16) __msa_fill_h(offset_in); + denom = (v8u16) __msa_fill_h(log2_denom); + + for (cnt = height / 4; cnt--;) { + LD_UB4(data, stride, src0, src1, src2, src3); + ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, + src0_r, src1_r, src2_r, src3_r); + ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, + src0_l, src1_l, src2_l, src3_l); + MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, + temp0, temp1, temp2, temp3); + MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, + temp4, temp5, temp6, temp7); + ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3); + ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset, + temp4, temp5, temp6, temp7); + MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); + MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0); + SRL_H4_UH(temp0, temp1, 
temp2, temp3, denom); + SRL_H4_UH(temp4, temp5, temp6, temp7, denom); + SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); + SAT_UH4_UH(temp4, temp5, temp6, temp7, 7); + PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, data, stride); + data += 4 * stride; + } +} + +static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t log2_denom, int32_t src_weight, + int32_t dst_weight, int32_t offset_in) +{ + uint32_t load0, load1, out0, out1; + v16i8 src_wgt, dst_wgt, wgt; + v16i8 src0, src1, dst0, dst1; + v8i16 temp0, temp1, denom, offset, add_val; + int32_t val = 128 * (src_weight + dst_weight); + + offset_in = ((offset_in + 1) | 1) << log2_denom; + + src_wgt = __msa_fill_b(src_weight); + dst_wgt = __msa_fill_b(dst_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom + 1); + add_val = __msa_fill_h(val); + offset += add_val; + + wgt = __msa_ilvev_b(dst_wgt, src_wgt); + + load0 = LW(src); + src += src_stride; + load1 = LW(src); + + src0 = (v16i8) __msa_fill_w(load0); + src1 = (v16i8) __msa_fill_w(load1); + + load0 = LW(dst); + load1 = LW(dst + dst_stride); + + dst0 = (v16i8) __msa_fill_w(load0); + dst1 = (v16i8) __msa_fill_w(load1); + + XORI_B4_128_SB(src0, src1, dst0, dst1); + ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1); + + temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0); + temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1); + + temp0 >>= denom; + temp1 >>= denom; + + CLIP_SH2_0_255(temp0, temp1); + PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); +} + +static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t dst_weight, + int32_t offset_in) +{ + uint8_t cnt; + uint32_t load0, load1, load2, load3; + v16i8 src_wgt, dst_wgt, wgt; + v16i8 src0, src1, src2, src3; + v16i8 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3; + v8i16 denom, offset, add_val; + int32_t val = 128 * (src_weight + dst_weight); + + offset_in = ((offset_in + 1) | 1) << log2_denom; + + src_wgt = __msa_fill_b(src_weight); + dst_wgt = __msa_fill_b(dst_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom + 1); + add_val = __msa_fill_h(val); + offset += add_val; + + wgt = __msa_ilvev_b(dst_wgt, src_wgt); + + for (cnt = height / 4; cnt--;) { + LW4(src, src_stride, load0, load1, load2, load3); + src += (4 * src_stride); + + src0 = (v16i8) __msa_fill_w(load0); + src1 = (v16i8) __msa_fill_w(load1); + src2 = (v16i8) __msa_fill_w(load2); + src3 = (v16i8) __msa_fill_w(load3); + + LW4(dst, dst_stride, load0, load1, load2, load3); + + dst0 = (v16i8) __msa_fill_w(load0); + dst1 = (v16i8) __msa_fill_w(load1); + dst2 = (v16i8) __msa_fill_w(load2); + dst3 = (v16i8) __msa_fill_w(load3); + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(dst0, dst1, dst2, dst3); + ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3, + temp0, temp1, temp2, temp3); + + temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0); + temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1); + temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2); + temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3); + + SRA_4V(temp0, temp1, temp2, temp3, denom); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + PCKEV_ST4x4_UB(temp0, 
temp1, temp2, temp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t dst_weight, + int32_t offset_in) +{ + if (2 == height) { + avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom, + src_weight, dst_weight, offset_in); + } else { + avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height, + log2_denom, src_weight, dst_weight, + offset_in); + } +} + +static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t dst_weight, + int32_t offset_in) +{ + uint8_t cnt; + v16i8 src_wgt, dst_wgt, wgt; + v16i8 src0, src1, src2, src3; + v16i8 dst0, dst1, dst2, dst3; + v16i8 out0, out1; + v8i16 temp0, temp1, temp2, temp3; + v8i16 denom, offset, add_val; + int32_t val = 128 * (src_weight + dst_weight); + + offset_in = ((offset_in + 1) | 1) << log2_denom; + + src_wgt = __msa_fill_b(src_weight); + dst_wgt = __msa_fill_b(dst_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom + 1); + add_val = __msa_fill_h(val); + offset += add_val; + + wgt = __msa_ilvev_b(dst_wgt, src_wgt); + + for (cnt = height / 4; cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(dst0, dst1, dst2, dst3); + ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3, + temp0, temp1, temp2, temp3); + + temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0); + temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1); + temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2); + temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3); + + SRA_4V(temp0, temp1, temp2, temp3, denom); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += 4 * dst_stride; + } +} + +static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t log2_denom, + int32_t src_weight, int32_t dst_weight, + int32_t offset_in) +{ + uint8_t cnt; + v16i8 src_wgt, dst_wgt, wgt; + v16i8 src0, src1, src2, src3; + v16i8 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 denom, offset, add_val; + int32_t val = 128 * (src_weight + dst_weight); + + offset_in = ((offset_in + 1) | 1) << log2_denom; + + src_wgt = __msa_fill_b(src_weight); + dst_wgt = __msa_fill_b(dst_weight); + offset = __msa_fill_h(offset_in); + denom = __msa_fill_h(log2_denom + 1); + add_val = __msa_fill_h(val); + offset += add_val; + + wgt = __msa_ilvev_b(dst_wgt, src_wgt); + + for (cnt = height / 4; cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(dst0, dst1, dst2, dst3); + ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, + vec0, vec2, vec4, vec6); + ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, + vec1, vec3, vec5, vec7); + + temp0 = __msa_dpadd_s_h(offset, wgt, vec0); + temp1 = __msa_dpadd_s_h(offset, wgt, vec1); + temp2 = __msa_dpadd_s_h(offset, wgt, vec2); + temp3 = __msa_dpadd_s_h(offset, wgt, vec3); + temp4 = 
__msa_dpadd_s_h(offset, wgt, vec4); + temp5 = __msa_dpadd_s_h(offset, wgt, vec5); + temp6 = __msa_dpadd_s_h(offset, wgt, vec6); + temp7 = __msa_dpadd_s_h(offset, wgt, vec7); + + SRA_4V(temp0, temp1, temp2, temp3, denom); + SRA_4V(temp4, temp5, temp6, temp7, denom); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + CLIP_SH4_0_255(temp4, temp5, temp6, temp7); + PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3); + ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += 4 * dst_stride; + } +} + +#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \ + q3_or_p3_org_in, p1_or_q1_org_in, \ + p2_or_q2_org_in, q1_or_p1_org_in, \ + p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \ +{ \ + v8i16 threshold; \ + v8i16 const3 = __msa_ldi_h(3); \ + \ + threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \ + threshold += (p1_or_q1_org_in); \ + \ + (p0_or_q0_out) = threshold << 1; \ + (p0_or_q0_out) += (p2_or_q2_org_in); \ + (p0_or_q0_out) += (q1_or_p1_org_in); \ + (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \ + \ + (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \ + (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \ + \ + (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \ + (p2_or_q2_out) += (p3_or_q3_org_in); \ + (p2_or_q2_out) += (p3_or_q3_org_in); \ + (p2_or_q2_out) += threshold; \ + (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \ +} + +/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */ +#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \ + p1_or_q1_org_in, p0_or_q0_out) \ +{ \ + (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \ + (p0_or_q0_out) += (p1_or_q1_org_in); \ + (p0_or_q0_out) += (p1_or_q1_org_in); \ + (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \ +} + +#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \ + p1_or_q1_org_in, p2_or_q2_org_in, \ + negate_tc_in, tc_in, p1_or_q1_out) \ +{ \ + v8i16 clip3, temp; \ + \ + clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \ + (v8u16) q0_or_p0_org_in); \ + temp = p1_or_q1_org_in << 1; \ + clip3 = clip3 - temp; \ + clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \ + clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ + p1_or_q1_out = p1_or_q1_org_in + clip3; \ +} + +#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \ + p1_or_q1_org_in, q1_or_p1_org_in, \ + negate_threshold_in, threshold_in, \ + p0_or_q0_out, q0_or_p0_out) \ +{ \ + v8i16 q0_sub_p0, p1_sub_q1, delta; \ + \ + q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ + p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ + q0_sub_p0 <<= 2; \ + p1_sub_q1 += 4; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta >>= 3; \ + \ + delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ + \ + p0_or_q0_out = p0_or_q0_org_in + delta; \ + q0_or_p0_out = q0_or_p0_org_in - delta; \ + \ + CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \ +} + +#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ +{ \ + uint32_t load0, load1, load2, load3; \ + v16u8 src0 = { 0 }; \ + v16u8 src1 = { 0 }; \ + v16u8 src2 = { 0 }; \ + v16u8 src3 = { 0 }; \ + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ + v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ + v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \ + v8i16 res0_r, res1_r; \ + v16i8 zeros = { 0 }; \ + v16u8 res0, res1; \ + \ + LW4((src - 2), stride, load0, load1, load2, load3); \ + src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ + src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ + src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \ + 
src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \ + \ + TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \ + \ + p0_asub_q0 = __msa_asub_u_b(src2, src1); \ + p1_asub_p0 = __msa_asub_u_b(src1, src0); \ + q1_asub_q0 = __msa_asub_u_b(src2, src3); \ + \ + tc = __msa_fill_h(tc_val); \ + \ + is_less_than_alpha = (p0_asub_q0 < alpha); \ + is_less_than_beta = (p1_asub_p0 < beta); \ + is_less_than = is_less_than_alpha & is_less_than_beta; \ + is_less_than_beta = (q1_asub_q0 < beta); \ + is_less_than = is_less_than_beta & is_less_than; \ + \ + ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ + HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ + \ + q0_sub_p0 <<= 2; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta = __msa_srari_h(delta, 3); \ + \ + delta = CLIP_SH(delta, -tc, tc); \ + \ + ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ + \ + res0_r += delta; \ + res1_r -= delta; \ + \ + CLIP_SH2_0_255(res0_r, res1_r); \ + PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ + \ + res0 = __msa_bmnz_v(src1, res0, is_less_than); \ + res1 = __msa_bmnz_v(src2, res1, is_less_than); \ + \ + res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ +} + +#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \ +{ \ + v16i8 zero_m = { 0 }; \ + \ + out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ + out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \ + SLDI_B2_0_UB(out1, out2, out2, out3, 2); \ +} + +#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ +{ \ + uint32_t load0, load1; \ + v16u8 src0 = { 0 }; \ + v16u8 src1 = { 0 }; \ + v16u8 src2 = { 0 }; \ + v16u8 src3 = { 0 }; \ + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ + v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ + v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \ + v16i8 zeros = { 0 }; \ + v16u8 res0, res1; \ + \ + load0 = LW(src - 2); \ + load1 = LW(src - 2 + stride); \ + \ + src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ + src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ + \ + TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \ + \ + p0_asub_q0 = __msa_asub_u_b(src2, src1); \ + p1_asub_p0 = __msa_asub_u_b(src1, src0); \ + q1_asub_q0 = __msa_asub_u_b(src2, src3); \ + \ + tc = __msa_fill_h(tc_val); \ + \ + is_less_than_alpha = (p0_asub_q0 < alpha); \ + is_less_than_beta = (p1_asub_p0 < beta); \ + is_less_than = is_less_than_alpha & is_less_than_beta; \ + is_less_than_beta = (q1_asub_q0 < beta); \ + is_less_than = is_less_than_beta & is_less_than; \ + \ + ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ + HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ + \ + q0_sub_p0 <<= 2; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta = __msa_srari_h(delta, 3); \ + delta = CLIP_SH(delta, -tc, tc); \ + \ + ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ + \ + res0_r += delta; \ + res1_r -= delta; \ + \ + CLIP_SH2_0_255(res0_r, res1_r); \ + PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ + \ + res0 = __msa_bmnz_v(src1, res0, is_less_than); \ + res1 = __msa_bmnz_v(src2, res1, is_less_than); \ + \ + res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ +} + +static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0; + v16u8 alpha, beta; + v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta; + v16u8 p2, p1, p0, q0, q1, q2; + v16u8 p3_org, p2_org, p1_org, p0_org, 
q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16u8 tmp_flag; + v16i8 zero = { 0 }; + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org); + + { + v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + } + + if (!__msa_test_bz_v(is_less_than)) { + q2_org = LD_UB(data + (2 * img_width)); + p3_org = LD_UB(data - (img_width << 2)); + p2_org = LD_UB(data - (3 * img_width)); + + UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); + UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); + UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); + + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = (p0_asub_q0 < tmp_flag); + + p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); + is_less_than_beta = (p2_asub_p0 < beta); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + + q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); + + is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) { + v8i16 p3_org_r; + + ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r); + } + + q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); + + is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + + if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) { + v8i16 p3_org_l; + + ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l); + } + } + /* combine and store */ + if (!__msa_test_bz_v(is_less_than_beta)) { + PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); + + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); + p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); + p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); + + ST_UB(p1_org, data - (2 * img_width)); + ST_UB(p2_org, data - (3 * img_width)); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + + negate_is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8); + if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) { + AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); + } + + negate_is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8); + if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) { + AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); + } + } + /* combine */ + if 
(!__msa_test_bz_v(negate_is_less_than_beta)) { + p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); + p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta); + } + + ST_UB(p0_org, data - img_width); + + /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */ + q3_org = LD_UB(data + (3 * img_width)); + q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); + is_less_than_beta = (q2_asub_q0 < beta); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) { + v8i16 q3_org_r; + + ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r); + } + is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) { + v8i16 q3_org_l; + + ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l); + } + } + + /* combine and store */ + if (!__msa_test_bz_v(is_less_than_beta)) { + PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); + q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); + q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); + + ST_UB(q1_org, data + img_width); + ST_UB(q2_org, data + 2 * img_width); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + negate_is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8); + if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) { + AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); + } + + negate_is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8); + if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) { + AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); + } + } + /* combine */ + if (!__msa_test_bz_v(negate_is_less_than_beta)) { + q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); + q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); + } + ST_UB(q0_org, data); + } +} + +static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + uint8_t *src; + v16u8 alpha, beta, p0_asub_q0; + v16u8 is_less_than_alpha, is_less_than; + v16u8 is_less_than_beta, negate_is_less_than_beta; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16i8 zero = { 0 }; + v16u8 tmp_flag; + + src = data - 4; + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src + (8 * img_width), img_width, + row8, row9, row10, row11, row12, row13, row14, row15); 
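+        /* Vertical edge: transpose the 16 loaded rows (8 bytes each, p3..q3
+         * straddling the edge at data - 4) so that p3_org..q3_org each hold
+         * one pixel position for all 16 rows, and the same per-column filter
+         * arithmetic as the horizontal-edge case can be applied below. */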
+ + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, + row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org); + } + UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); + UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); + UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); + UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l); + + /* if ( ((unsigned)ABS(p0-q0) < thresholds->alpha_in) && + ((unsigned)ABS(p1-p0) < thresholds->beta_in) && + ((unsigned)ABS(q1-q0) < thresholds->beta_in) ) */ + { + v16u8 p1_asub_p0, q1_asub_q0; + + p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + } + + if (!__msa_test_bz_v(is_less_than)) { + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = (p0_asub_q0 < tmp_flag); + + { + v16u8 p2_asub_p0; + + p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); + is_less_than_beta = (p2_asub_p0 < beta); + } + is_less_than_beta = tmp_flag & is_less_than_beta; + negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + /* right */ + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v(is_less_than_beta_r)) { + v8i16 p3_org_r; + + ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r); + } + } + /* left */ + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v(is_less_than_beta_l)) { + v8i16 p3_org_l; + + ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l); + } + } + /* combine and store */ + if (!__msa_test_bz_v(is_less_than_beta)) { + v16u8 p0, p2, p1; + + PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); + p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); + p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); + } + /* right */ + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8); + + if (!__msa_test_bz_v(negate_is_less_than_beta_r)) { + AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); + } + } + /* left */ + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8); + if (!__msa_test_bz_v(negate_is_less_than_beta_l)) { + AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); + } + } + + if (!__msa_test_bz_v(negate_is_less_than_beta)) { + v16u8 p0; + + p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); + p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta); + } + + { + v16u8 q2_asub_q0; + + q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); + is_less_than_beta = (q2_asub_q0 < beta); + } + + is_less_than_beta = is_less_than_beta & tmp_flag; + 
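+        /* Strong filtering of q0/q1/q2 applies only where
+         * abs(p0 - q0) < (alpha >> 2) + 2 (tmp_flag) and abs(q2 - q0) < beta;
+         * the complementary rows fall back to the weak 3-tap q0 filter. */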
negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); + + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + /* right */ + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v(is_less_than_beta_r)) { + v8i16 q3_org_r; + + ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r); + } + } + /* left */ + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v(is_less_than_beta_l)) { + v8i16 q3_org_l; + + ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); + AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l); + } + } + /* combine and store */ + if (!__msa_test_bz_v(is_less_than_beta)) { + v16u8 q0, q1, q2; + + PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); + q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); + q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); + } + + /* right */ + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8); + if (!__msa_test_bz_v(negate_is_less_than_beta_r)) { + AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); + } + } + /* left */ + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8); + if (!__msa_test_bz_v(negate_is_less_than_beta_l)) { + AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); + } + } + if (!__msa_test_bz_v(negate_is_less_than_beta)) { + v16u8 q0; + + q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); + q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); + } + } + { + v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + ILVRL_B2_SH(p1_org, p2_org, tp0, tp2); + ILVRL_B2_SH(q0_org, p0_org, tp1, tp3); + ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5); + + ILVRL_H2_SH(tp1, tp0, tmp3, tmp4); + ILVRL_H2_SH(tp3, tp2, tmp6, tmp7); + + src = data - 3; + ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width); + ST2x4_UB(tmp2, 0, src + 4, img_width); + src += 4 * img_width; + ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width); + ST2x4_UB(tmp2, 4, src + 4, img_width); + src += 4 * img_width; + + ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width); + ST2x4_UB(tmp5, 0, src + 4, img_width); + src += 4 * img_width; + ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width); + ST2x4_UB(tmp5, 4, src + 4, img_width); + } +} + +static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, + int32_t alpha_in, + int32_t beta_in) +{ + uint64_t load0, load1; + uint32_t out0, out2; + uint16_t out1, out3; + v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8u16 dst0_r, dst1_r, dst4_r, dst5_r; + v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r; + v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y; + v8i16 tmp0, tmp1, tmp2, tmp3; + v16u8 alpha, beta; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; + v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; + v16u8 is_less_than_beta1, is_less_than_beta2; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 src2 = { 0 }; + v16i8 src3 = { 0 }; + v16i8 src4 = { 0 }; + v16i8 src5 = { 0 }; + v16i8 src6 = { 0 }; + 
v16i8 src7 = { 0 }; + v16i8 zeros = { 0 }; + + load0 = LD(src - 4); + load1 = LD(src + stride - 4); + src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0); + src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1); + + load0 = LD(src + (2 * stride) - 4); + load1 = LD(src + (3 * stride) - 4); + src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0); + src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1); + + load0 = LD(src + (4 * stride) - 4); + load1 = LD(src + (5 * stride) - 4); + src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0); + src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1); + + load0 = LD(src + (6 * stride) - 4); + load1 = LD(src + (7 * stride) - 4); + src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0); + src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, + src0, src1, src2, src3); + + ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2); + ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3); + + ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); + ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); + SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8); + + p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); + p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); + q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_alpha & is_less_than_beta; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than & is_less_than_beta; + + alpha >>= 2; + alpha += 2; + + is_less_than_alpha = (p0_asub_q0 < alpha); + + p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2); + is_less_than_beta1 = (p2_asub_p0 < beta); + q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3); + is_less_than_beta2 = (q2_asub_q0 < beta); + + ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3, + src0_r, src1_r, src2_r, src3_r); + ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7, + src4_r, src5_r, src6_r, src7_r); + + dst2_x_r = src1_r + src2_r + src3_r; + dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r; + dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3); + dst1_r = src0_r + src1_r + src2_r + src3_r; + dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2); + + dst0_r = (2 * src6_r) + (3 * src0_r); + dst0_r += src1_r + src2_r + src3_r; + dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3); + dst2_y_r = (2 * src1_r) + src2_r + src4_r; + dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); + + PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y); + dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1); + + dst3_x_r = src2_r + src3_r + src4_r; + dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r; + dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3); + dst4_r = src2_r + src3_r + src4_r + src5_r; + dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2); + + dst5_r = (2 * src7_r) + (3 * src5_r); + dst5_r += src4_r + src3_r + src2_r; + dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3); + dst3_y_r = (2 * src4_r) + src3_r + src1_r; + dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2); + + PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y); + dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2); + + dst2_y_r = (2 * src1_r) + src2_r + src4_r; + dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); + dst3_y_r = (2 * src4_r) + src3_r + src1_r; + dst3_y_r = (v8u16) 
__msa_srari_h((v8i16) dst3_y_r, 2); + + PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y); + + dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha); + dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha); + dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than); + dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than); + + is_less_than = is_less_than_alpha & is_less_than; + dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r); + is_less_than_beta1 = is_less_than_beta1 & is_less_than; + dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1); + + dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); + dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1); + dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r); + is_less_than_beta2 = is_less_than_beta2 & is_less_than; + dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2); + dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r); + dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2); + + ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1); + dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4); + ILVRL_H2_SH(dst1, dst0, tmp0, tmp1); + ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); + + ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); + SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8); + dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); + dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1); + SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_h((v8i16) dst0, 2); + out2 = __msa_copy_u_w((v4i32) dst1, 0); + out3 = __msa_copy_u_h((v8i16) dst1, 2); + + SW(out0, (src - 3)); + SH(out1, (src + 1)); + src += stride; + SW(out2, (src - 3)); + SH(out3, (src + 1)); + src += stride; + + out0 = __msa_copy_u_w((v4i32) dst2_x, 0); + out1 = __msa_copy_u_h((v8i16) dst2_x, 2); + out2 = __msa_copy_u_w((v4i32) dst3_x, 0); + out3 = __msa_copy_u_h((v8i16) dst3_x, 2); + + SW(out0, (src - 3)); + SH(out1, (src + 1)); + src += stride; + SW(out2, (src - 3)); + SH(out3, (src + 1)); + src += stride; + + out0 = __msa_copy_u_w((v4i32) dst4, 0); + out1 = __msa_copy_u_h((v8i16) dst4, 2); + out2 = __msa_copy_u_w((v4i32) dst5, 0); + out3 = __msa_copy_u_h((v8i16) dst5, 2); + + SW(out0, (src - 3)); + SH(out1, (src + 1)); + src += stride; + SW(out2, (src - 3)); + SH(out3, (src + 1)); + src += stride; + + out0 = __msa_copy_u_w((v4i32) dst2_y, 0); + out1 = __msa_copy_u_h((v8i16) dst2_y, 2); + out2 = __msa_copy_u_w((v4i32) dst3_y, 0); + out3 = __msa_copy_u_h((v8i16) dst3_y, 2); + + SW(out0, (src - 3)); + SH(out1, (src + 1)); + src += stride; + SW(out2, (src - 3)); + SH(out3, (src + 1)); +} + +static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + v16u8 alpha, beta; + v16u8 is_less_than; + v8i16 p0_or_q0, q0_or_p0; + v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; + v16i8 zero = { 0 }; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_less_than_beta; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + LD_UB4(data_cb_or_cr - (img_width << 1), img_width, + p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org); + + p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); + p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); + q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); + + is_less_than_alpha = (p0_asub_q0 < 
alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + + is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); + + if (!__msa_test_bz_v(is_less_than)) { + ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, + zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); + AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); + AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); + PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); + + p0_or_q0_org = + __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); + q0_or_p0_org = + __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); + + ST_UB(q0_or_p0_org, data_cb_or_cr); + ST_UB(p0_or_q0_org, data_cb_or_cr - img_width); + } +} + +static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + v8i16 tmp1; + v16u8 alpha, beta, is_less_than; + v8i16 p0_or_q0, q0_or_p0; + v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; + v16i8 zero = { 0 }; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_less_than_beta; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + + LD_UB8((data_cb_or_cr - 2), img_width, + row0, row1, row2, row3, row4, row5, row6, row7); + + TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + p1_or_q1_org, p0_or_q0_org, + q0_or_p0_org, q1_or_p1_org); + } + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); + p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); + q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); + + if (!__msa_test_bz_v(is_less_than)) { + ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, + zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); + + AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); + AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); + + /* convert 16 bit output into 8 bit output */ + PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); + + p0_or_q0_org = + __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); + q0_or_p0_org = + __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); + tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org); + + data_cb_or_cr -= 1; + ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width); + data_cb_or_cr += 4 * img_width; + ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width); + } +} + +static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data, + uint8_t bs0, uint8_t bs1, + uint8_t bs2, uint8_t bs3, + uint8_t tc0, uint8_t tc1, + uint8_t tc2, uint8_t tc3, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + uint8_t *src; + v16u8 beta, tmp_vec, bs = { 0 }; + v16u8 tc = { 0 }; + v16u8 is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p0_r, q0_r, p1_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 p0_l, q0_l, p1_l = { 0 }; + v8i16 q1_l = { 0 }; + v16u8 p3_org, p2_org, 
p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v8i16 tc_r, tc_l; + v16i8 zero = { 0 }; + v16u8 is_bs_greater_than0; + + tmp_vec = (v16u8) __msa_fill_b(bs0); + bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs1); + bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs2); + bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs3); + bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec); + + if (!__msa_test_bz_v(bs)) { + tmp_vec = (v16u8) __msa_fill_b(tc0); + tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc1); + tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc2); + tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc3); + tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec); + + is_bs_greater_than0 = (zero < bs); + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + src = data; + src -= 4; + + LD_UB8(src, img_width, + row0, row1, row2, row3, row4, row5, row6, row7); + src += (8 * img_width); + LD_UB8(src, img_width, + row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org); + } + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha; + v16u8 is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + if (!__msa_test_bz_v(is_less_than)) { + v16i8 negate_tc, sign_negate_tc; + v8i16 negate_tc_r, i16_negatetc_l; + + negate_tc = zero - (v16i8) tc; + sign_negate_tc = __msa_clti_s_b(negate_tc, 0); + + ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l); + + UNPCK_UB_SH(tc, tc_r, tc_l); + UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); + UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); + UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); + + { + v16u8 p2_asub_p0; + v16u8 is_less_than_beta_r, is_less_than_beta_l; + + p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); + is_less_than_beta = (p2_asub_p0 < beta); + is_less_than_beta = is_less_than_beta & is_less_than; + + is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v(is_less_than_beta_r)) { + p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org); + + AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r); + } + + is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v(is_less_than_beta_l)) { + p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org); + + AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l); + } + } + + if 
(!__msa_test_bz_v(is_less_than_beta)) { + p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r); + p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); + + is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); + tc = tc + is_less_than_beta; + } + + { + v16u8 u8_q2asub_q0; + v16u8 is_less_than_beta_l, is_less_than_beta_r; + + u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org); + is_less_than_beta = (u8_q2asub_q0 < beta); + is_less_than_beta = is_less_than_beta & is_less_than; + + q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); + + is_less_than_beta_r = + (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v(is_less_than_beta_r)) { + q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org); + AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r); + } + + q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); + + is_less_than_beta_l = + (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v(is_less_than_beta_l)) { + q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org); + AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l); + } + } + + if (!__msa_test_bz_v(is_less_than_beta)) { + q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r); + q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); + + is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); + tc = tc + is_less_than_beta; + } + + { + v8i16 threshold_r, negate_thresh_r; + v8i16 threshold_l, negate_thresh_l; + v16i8 negate_thresh, sign_negate_thresh; + + negate_thresh = zero - (v16i8) tc; + sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0); + + ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r); + + AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r); + + threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc); + negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh, + negate_thresh); + + AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l); + } + + PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0); + + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); + } + { + v16i8 tp0, tp1, tp2, tp3; + v8i16 tmp2, tmp5; + v4i32 tmp3, tmp4, tmp6, tmp7; + uint32_t out0, out2; + uint16_t out1, out3; + + src = data - 3; + + ILVRL_B2_SB(p1_org, p2_org, tp0, tp2); + ILVRL_B2_SB(q0_org, p0_org, tp1, tp3); + ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5); + + ILVRL_H2_SW(tp1, tp0, tmp3, tmp4); + ILVRL_H2_SW(tp3, tp2, tmp6, tmp7); + + out0 = __msa_copy_u_w(tmp3, 0); + out1 = __msa_copy_u_h(tmp2, 0); + out2 = __msa_copy_u_w(tmp3, 1); + out3 = __msa_copy_u_h(tmp2, 1); + + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp3, 2); + out1 = __msa_copy_u_h(tmp2, 2); + out2 = __msa_copy_u_w(tmp3, 3); + out3 = __msa_copy_u_h(tmp2, 3); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp4, 0); + out1 = __msa_copy_u_h(tmp2, 4); + out2 = __msa_copy_u_w(tmp4, 1); + out3 = __msa_copy_u_h(tmp2, 5); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp4, 2); + out1 = __msa_copy_u_h(tmp2, 6); + out2 = __msa_copy_u_w(tmp4, 3); + out3 = __msa_copy_u_h(tmp2, 7); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + 
src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp6, 0); + out1 = __msa_copy_u_h(tmp5, 0); + out2 = __msa_copy_u_w(tmp6, 1); + out3 = __msa_copy_u_h(tmp5, 1); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp6, 2); + out1 = __msa_copy_u_h(tmp5, 2); + out2 = __msa_copy_u_w(tmp6, 3); + out3 = __msa_copy_u_h(tmp5, 3); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp7, 0); + out1 = __msa_copy_u_h(tmp5, 4); + out2 = __msa_copy_u_w(tmp7, 1); + out3 = __msa_copy_u_h(tmp5, 5); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + + out0 = __msa_copy_u_w(tmp7, 2); + out1 = __msa_copy_u_h(tmp5, 6); + out2 = __msa_copy_u_w(tmp7, 3); + out3 = __msa_copy_u_h(tmp5, 7); + + src += img_width; + SW(out0, src); + SH(out1, (src + 4)); + src += img_width; + SW(out2, src); + SH(out3, (src + 4)); + } + } +} + +static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, + uint8_t bs0, uint8_t bs1, + uint8_t bs2, uint8_t bs3, + uint8_t tc0, uint8_t tc1, + uint8_t tc2, uint8_t tc3, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t image_width) +{ + v16u8 p2_asub_p0, u8_q2asub_q0; + v16u8 alpha, beta, is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p1_r = { 0 }; + v8i16 p0_r, q0_r, q1_r = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l, q0_l, q1_l = { 0 }; + v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v16i8 zero = { 0 }; + v16u8 tmp_vec; + v16u8 bs = { 0 }; + v16i8 tc = { 0 }; + + tmp_vec = (v16u8) __msa_fill_b(bs0); + bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs1); + bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs2); + bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(bs3); + bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec); + + if (!__msa_test_bz_v(bs)) { + tmp_vec = (v16u8) __msa_fill_b(tc0); + tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc1); + tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc2); + tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec); + tmp_vec = (v16u8) __msa_fill_b(tc3); + tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + LD_UB5(data - (3 * image_width), image_width, + p2_org, p1_org, p0_org, q0_org, q1_org); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_bs_greater_than0; + + is_bs_greater_than0 = ((v16u8) zero < bs); + p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + + if (!__msa_test_bz_v(is_less_than)) { + v16i8 sign_negate_tc, negate_tc; + v8i16 negate_tc_r, 
i16_negatetc_l, tc_l, tc_r; + + q2_org = LD_UB(data + (2 * image_width)); + negate_tc = zero - tc; + sign_negate_tc = __msa_clti_s_b(negate_tc, 0); + + ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l); + + UNPCK_UB_SH(tc, tc_r, tc_l); + UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); + UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); + UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); + + p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); + is_less_than_beta = (p2_asub_p0 < beta); + is_less_than_beta = is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + + is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) { + p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org); + + AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r); + } + + is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) { + p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org); + + AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l); + } + } + if (!__msa_test_bz_v(is_less_than_beta)) { + p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r); + p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); + ST_UB(p1_org, data - (2 * image_width)); + + is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); + tc = tc + (v16i8) is_less_than_beta; + } + + u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org); + is_less_than_beta = (u8_q2asub_q0 < beta); + is_less_than_beta = is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + is_less_than_beta_r = + (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8); + + q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) { + q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org); + + AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r); + } + is_less_than_beta_l = + (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8); + + q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); + if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) { + q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org); + + AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l); + } + } + if (!__msa_test_bz_v(is_less_than_beta)) { + q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r); + q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); + ST_UB(q1_org, data + image_width); + + is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); + tc = tc + (v16i8) is_less_than_beta; + } + { + v16i8 negate_thresh, sign_negate_thresh; + v8i16 threshold_r, threshold_l; + v8i16 negate_thresh_l, negate_thresh_r; + + negate_thresh = zero - tc; + sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0); + + ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r); + AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r); + + threshold_l = (v8i16) __msa_ilvl_b(zero, tc); + negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh, + negate_thresh); + AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l); + } + + PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0); + + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); + + ST_UB(p0_org, (data - image_width)); + ST_UB(q0_org, data); + } + } +} 
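+
+/* Illustrative scalar reference (editor's sketch, not part of the original
+ * patch): the normal-strength (bS < 4) p0/q0 update that the AVC_LPF_P0Q0
+ * macro above evaluates on eight columns at a time.  The function name is
+ * hypothetical; the block is kept under #if 0 so it does not affect the
+ * build. */
+#if 0
+static void avc_lpf_p0q0_scalar_ref(uint8_t *p0, uint8_t *q0,
+                                    int p1, int q1, int tc)
+{
+    /* delta = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) */
+    int t, delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
+
+    if (delta < -tc)
+        delta = -tc;
+    else if (delta > tc)
+        delta = tc;
+
+    t   = *p0 + delta;                     /* CLIP_SH2_0_255 equivalent */
+    *p0 = t < 0 ? 0 : t > 255 ? 255 : t;
+    t   = *q0 - delta;
+    *q0 = t < 0 ? 0 : t > 255 ? 255 : t;
+}
+#endif
+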
+ +static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride, + int32_t alpha_in, int32_t beta_in, + int8_t *tc0) +{ + uint8_t *data = in; + uint32_t out0, out1, out2, out3; + uint64_t load; + uint32_t tc_val; + v16u8 alpha, beta; + v16i8 inp0 = { 0 }; + v16i8 inp1 = { 0 }; + v16i8 inp2 = { 0 }; + v16i8 inp3 = { 0 }; + v16i8 inp4 = { 0 }; + v16i8 inp5 = { 0 }; + v16i8 inp6 = { 0 }; + v16i8 inp7 = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 src4, src5, src6, src7; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; + v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; + v16u8 is_less_than_beta1, is_less_than_beta2; + v8i16 tc, tc_orig_r, tc_plus1; + v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 }; + v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1; + v8u16 src2_r, src3_r; + v8i16 p2_r, p1_r, q2_r, q1_r; + v16u8 p2, q2, p0, q0; + v4i32 dst0, dst1; + v16i8 zeros = { 0 }; + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + if (tc0[0] < 0) { + data += (2 * stride); + } else { + load = LD(data - 3); + inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load); + load = LD(data - 3 + stride); + inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load); + data += (2 * stride); + } + + if (tc0[1] < 0) { + data += (2 * stride); + } else { + load = LD(data - 3); + inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load); + load = LD(data - 3 + stride); + inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load); + data += (2 * stride); + } + + if (tc0[2] < 0) { + data += (2 * stride); + } else { + load = LD(data - 3); + inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load); + load = LD(data - 3 + stride); + inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load); + data += (2 * stride); + } + + if (tc0[3] < 0) { + data += (2 * stride); + } else { + load = LD(data - 3); + inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load); + load = LD(data - 3 + stride); + inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load); + data += (2 * stride); + } + + ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6, + src0, src1, src2, src3); + + ILVR_H2_SH(src1, src0, src3, src2, src4, src6); + ILVL_H2_SH(src1, src0, src3, src2, src5, src7); + + src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4); + src1 = __msa_sldi_b(zeros, (v16i8) src0, 8); + src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4); + src3 = __msa_sldi_b(zeros, (v16i8) src2, 8); + src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5); + src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8); + + p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); + p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); + q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3); + p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2); + q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_alpha & is_less_than_beta; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + + is_less_than_beta1 = (p2_asub_p0 < beta); + is_less_than_beta2 = (q2_asub_q0 < beta); + + p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2); + p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0); + p0_add_q0 = __msa_srari_h(p0_add_q0, 1); + + ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r); + p2_r += p0_add_q0; + p2_r >>= 1; + p2_r -= p1_r; + ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r); + q2_r += p0_add_q0; + q2_r >>= 1; + q2_r -= 
q1_r; + + tc_val = LW(tc0); + tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val); + tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig); + is_tc_orig1 = tc_orig; + is_tc_orig2 = tc_orig; + tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig); + tc = tc_orig_r; + + p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r); + q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r); + + p2_r += p1_r; + q2_r += q1_r; + + PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2); + + is_tc_orig1 = (zeros < is_tc_orig1); + is_tc_orig2 = is_tc_orig1; + is_tc_orig1 = is_less_than_beta1 & is_tc_orig1; + is_tc_orig2 = is_less_than_beta2 & is_tc_orig2; + is_tc_orig1 = is_less_than & is_tc_orig1; + is_tc_orig2 = is_less_than & is_tc_orig2; + + p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1); + q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2); + + q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0); + q0_sub_p0 <<= 2; + p1_sub_q1 = p1_r - q1_r; + q0_sub_p0 += p1_sub_q1; + q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3); + + tc_plus1 = tc + 1; + is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1, + (v16i8) is_less_than_beta1); + tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1); + tc_plus1 = tc + 1; + is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2, + (v16i8) is_less_than_beta2); + tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2); + + q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc); + + ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r); + src2_r += q0_sub_p0; + src3_r -= q0_sub_p0; + + src2_r = (v8u16) CLIP_SH_0_255(src2_r); + src3_r = (v8u16) CLIP_SH_0_255(src3_r); + + PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0); + + p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than); + q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than); + + ILVR_B2_UB(p0, p2, q2, q0, p2, q2); + + ILVRL_H2_SW(q2, p2, dst0, dst1); + + data = in; + + out0 = __msa_copy_u_w(dst0, 0); + out1 = __msa_copy_u_w(dst0, 1); + out2 = __msa_copy_u_w(dst0, 2); + out3 = __msa_copy_u_w(dst0, 3); + + if (tc0[0] < 0) { + data += (2 * stride); + } else { + SW(out0, (data - 2)); + data += stride; + SW(out1, (data - 2)); + data += stride; + } + + if (tc0[1] < 0) { + data += (2 * stride); + } else { + SW(out2, (data - 2)); + data += stride; + SW(out3, (data - 2)); + data += stride; + } + + out0 = __msa_copy_u_w(dst1, 0); + out1 = __msa_copy_u_w(dst1, 1); + out2 = __msa_copy_u_w(dst1, 2); + out3 = __msa_copy_u_w(dst1, 3); + + if (tc0[2] < 0) { + data += (2 * stride); + } else { + SW(out0, (data - 2)); + data += stride; + SW(out1, (data - 2)); + data += stride; + } + + if (tc0[3] >= 0) { + SW(out2, (data - 2)); + data += stride; + SW(out3, (data - 2)); + } +} + +static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data, + uint8_t bs0, uint8_t bs1, + uint8_t bs2, uint8_t bs3, + uint8_t tc0, uint8_t tc1, + uint8_t tc2, uint8_t tc3, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + v16u8 alpha, beta; + v8i16 tmp_vec; + v8i16 bs = { 0 }; + v8i16 tc = { 0 }; + v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than; + v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0; + v8i16 p0_r, q0_r; + v16u8 p1_org, p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v16i8 negate_tc, sign_negate_tc; + v8i16 tc_r, negate_tc_r; + v16i8 zero = { 0 }; + + tmp_vec = (v8i16) __msa_fill_b(bs0); + bs = __msa_insve_h(bs, 0, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(bs1); + bs = __msa_insve_h(bs, 1, tmp_vec); + tmp_vec = 
(v8i16) __msa_fill_b(bs2); + bs = __msa_insve_h(bs, 2, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(bs3); + bs = __msa_insve_h(bs, 3, tmp_vec); + + if (!__msa_test_bz_v((v16u8) bs)) { + tmp_vec = (v8i16) __msa_fill_b(tc0); + tc = __msa_insve_h(tc, 0, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc1); + tc = __msa_insve_h(tc, 1, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc2); + tc = __msa_insve_h(tc, 2, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc3); + tc = __msa_insve_h(tc, 3, tmp_vec); + + is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + LD_UB4(data - (img_width << 1), img_width, + p1_org, p0_org, q0_org, q1_org); + + p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + + is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); + + if (!__msa_test_bz_v(is_less_than)) { + negate_tc = zero - (v16i8) tc; + sign_negate_tc = __msa_clti_s_b(negate_tc, 0); + + ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r); + + ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, + p1_org_r, p0_org_r, q0_org_r, q1_org_r); + + AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r, + tc_r, p0_r, q0_r); + + PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0); + + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); + + ST_UB(q0_org, data); + ST_UB(p0_org, (data - img_width)); + } + } +} + +static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data, + uint8_t bs0, uint8_t bs1, + uint8_t bs2, uint8_t bs3, + uint8_t tc0, uint8_t tc1, + uint8_t tc2, uint8_t tc3, + uint8_t alpha_in, + uint8_t beta_in, + uint32_t img_width) +{ + uint8_t *src; + v16u8 alpha, beta; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than, is_less_than_beta, is_less_than_alpha; + v16u8 p0, q0; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v16u8 p1_org, p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v16u8 is_bs_greater_than0; + v8i16 tc_r, negate_tc_r; + v16i8 negate_tc, sign_negate_tc; + v16i8 zero = { 0 }; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v8i16 tmp1, tmp_vec, bs = { 0 }; + v8i16 tc = { 0 }; + + tmp_vec = (v8i16) __msa_fill_b(bs0); + bs = __msa_insve_h(bs, 0, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(bs1); + bs = __msa_insve_h(bs, 1, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(bs2); + bs = __msa_insve_h(bs, 2, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(bs3); + bs = __msa_insve_h(bs, 3, tmp_vec); + + if (!__msa_test_bz_v((v16u8) bs)) { + tmp_vec = (v8i16) __msa_fill_b(tc0); + tc = __msa_insve_h(tc, 0, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc1); + tc = __msa_insve_h(tc, 1, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc2); + tc = __msa_insve_h(tc, 2, tmp_vec); + tmp_vec = (v8i16) __msa_fill_b(tc3); + tc = __msa_insve_h(tc, 3, tmp_vec); + + is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs); + + LD_UB8((data - 2), img_width, + row0, row1, row2, row3, row4, row5, row6, row7); + + TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, + row4, row5, row6, row7, + p1_org, p0_org, q0_org, q1_org); + + p0_asub_q0 = 
__msa_asub_u_b(p0_org, q0_org); + p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); + q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + is_less_than_alpha = (p0_asub_q0 < alpha); + is_less_than_beta = (p1_asub_p0 < beta); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = (q1_asub_q0 < beta); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_bs_greater_than0 & is_less_than; + + is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); + + if (!__msa_test_bz_v(is_less_than)) { + ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, + p1_org_r, p0_org_r, q0_org_r, q1_org_r); + + negate_tc = zero - (v16i8) tc; + sign_negate_tc = __msa_clti_s_b(negate_tc, 0); + + ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r); + + AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r, + tc_r, p0_r, q0_r); + + PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0); + + p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); + q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); + tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org); + src = data - 1; + ST2x4_UB(tmp1, 0, src, img_width); + src += 4 * img_width; + ST2x4_UB(tmp1, 4, src, img_width); + } + } +} + +static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride, + int32_t alpha_in, int32_t beta_in, + int8_t *tc0) +{ + int32_t col, tc_val; + v16u8 alpha, beta, res; + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + for (col = 0; col < 4; col++) { + tc_val = (tc0[col] - 1) + 1; + + if (tc_val <= 0) { + src += (4 * stride); + continue; + } + + AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res); + ST2x4_UB(res, 0, (src - 1), stride); + src += (4 * stride); + } +} + +static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride, + int32_t alpha_in, + int32_t beta_in, + int8_t *tc0) +{ + int32_t col, tc_val; + int16_t out0, out1; + v16u8 alpha, beta, res; + + alpha = (v16u8) __msa_fill_b(alpha_in); + beta = (v16u8) __msa_fill_b(beta_in); + + for (col = 0; col < 4; col++) { + tc_val = (tc0[col] - 1) + 1; + + if (tc_val <= 0) { + src += 4 * stride; + continue; + } + + AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res); + + out0 = __msa_copy_s_h((v8i16) res, 0); + out1 = __msa_copy_s_h((v8i16) res, 1); + + SH(out0, (src - 1)); + src += stride; + SH(out1, (src - 1)); + src += stride; + } +} + +void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width, + int alpha, int beta, int8_t *tc) +{ + uint8_t bs0 = 1; + uint8_t bs1 = 1; + uint8_t bs2 = 1; + uint8_t bs3 = 1; + + if (tc[0] < 0) + bs0 = 0; + if (tc[1] < 0) + bs1 = 0; + if (tc[2] < 0) + bs2 = 0; + if (tc[3] < 0) + bs3 = 0; + + avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3, + tc[0], tc[1], tc[2], tc[3], + alpha, beta, img_width); +} + +void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width, + int alpha, int beta, int8_t *tc) +{ + + uint8_t bs0 = 1; + uint8_t bs1 = 1; + uint8_t bs2 = 1; + uint8_t bs3 = 1; + + if (tc[0] < 0) + bs0 = 0; + if (tc[1] < 0) + bs1 = 0; + if (tc[2] < 0) + bs2 = 0; + if (tc[3] < 0) + bs3 = 0; + + avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3, + tc[0], tc[1], tc[2], tc[3], + alpha, beta, img_width); +} + +void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width, + int alpha, int beta, int8_t *tc) +{ + uint8_t bs0 = 1; + uint8_t bs1 = 1; + uint8_t bs2 = 1; + uint8_t bs3 = 1; + 
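    /*
     * Note (illustrative sketch, not taken from the patch itself): the H.264
     * deblocking tc0 array encodes "do not filter this 4-pixel edge segment"
     * as a negative value, so the ff_h264_*_lpf_*_inter_msa wrappers (this one
     * and the luma variants above) fold that into a boundary strength of 0
     * before calling the MSA filter core. Roughly, the unrolled checks amount to:
     *
     *     uint8_t bs[4];
     *     for (int i = 0; i < 4; i++)
     *         bs[i] = tc[i] >= 0;   // 0 = skip this segment, 1 = filter with tc[i]
     */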
+ if (tc[0] < 0) + bs0 = 0; + if (tc[1] < 0) + bs1 = 0; + if (tc[2] < 0) + bs2 = 0; + if (tc[3] < 0) + bs3 = 0; + + avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3, + tc[0], tc[1], tc[2], tc[3], + alpha, beta, img_width); +} + +void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width, + int alpha, int beta, int8_t *tc) +{ + uint8_t bs0 = 1; + uint8_t bs1 = 1; + uint8_t bs2 = 1; + uint8_t bs3 = 1; + + if (tc[0] < 0) + bs0 = 0; + if (tc[1] < 0) + bs1 = 0; + if (tc[2] < 0) + bs2 = 0; + if (tc[3] < 0) + bs3 = 0; + + avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3, + tc[0], tc[1], tc[2], tc[3], + alpha, beta, img_width); +} + +void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width, + int alpha, int beta) +{ + avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha, + (uint8_t) beta, + (unsigned int) img_width); +} + +void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width, + int alpha, int beta) +{ + avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha, + (uint8_t) beta, + (unsigned int) img_width); +} + +void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width, + int alpha, int beta) +{ + avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha, + (uint8_t) beta, + (unsigned int) img_width); +} + +void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width, + int alpha, int beta) +{ + avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha, + (uint8_t) beta, + (unsigned int) img_width); +} + +void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, + int32_t ystride, + int32_t alpha, int32_t beta, + int8_t *tc0) +{ + avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0); +} + +void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, + int32_t ystride, + int32_t alpha, + int32_t beta, + int8_t *tc0) +{ + avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0); +} + +void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, + int32_t ystride, + int32_t alpha, + int32_t beta, + int8_t *tc0) +{ + avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0); +} + +void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, + int32_t ystride, + int32_t alpha, + int32_t beta) +{ + avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta); +} + +void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride, + int height, int log2_denom, + int weight_src, int offset) +{ + avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset); +} + +void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, + int height, int log2_denom, + int weight_src, int offset) +{ + avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset); +} + +void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride, + int height, int log2_denom, + int weight_src, int offset) +{ + avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset); +} + +void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, + int log2_denom, int weight_dst, + int weight_src, int offset) +{ + avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom, + weight_src, weight_dst, offset); +} + +void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, + int stride, int height, + int log2_denom, int weight_dst, + int weight_src, int offset) +{ + avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom, + weight_src, weight_dst, offset); +} + +void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, + int stride, int 
height, + int log2_denom, int weight_dst, + int weight_src, int offset) +{ + avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom, + weight_src, weight_dst, offset); +} diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c new file mode 100644 index 0000000000..fac1e7add4 --- /dev/null +++ b/libavcodec/mips/h264idct_msa.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h264dsp_mips.h" +#include "libavcodec/bit_depth_template.c" + +#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = in0 + in2; \ + tmp1_m = in0 - in2; \ + tmp2_m = in1 >> 1; \ + tmp2_m = tmp2_m - in3; \ + tmp3_m = in3 >> 1; \ + tmp3_m = in1 + tmp3_m; \ + \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3); \ +} + +static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + v8i16 src0, src1, src2, src3; + v8i16 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v8i16 zeros = { 0 }; + + LD4x4_SH(src, src0, src1, src2, src3); + AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3); + TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3); + AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3); + SRARI_H4_SH(vres0, vres1, vres2, vres3, 6); + ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride); + ST_SH2(zeros, zeros, src, 8); +} + +static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + int16_t dc; + uint32_t src0, src1, src2, src3; + v16u8 pred = { 0 }; + v16i8 out; + v8i16 input_dc, pred_r, pred_l; + + dc = (src[0] + 32) >> 6; + input_dc = __msa_fill_h(dc); + src[0] = 0; + + LW4(dst, dst_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, pred); + UNPCK_UB_SH(pred, pred_r, pred_l); + + pred_r += input_dc; + pred_l += input_dc; + + CLIP_SH2_0_255(pred_r, pred_l); + out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, + int32_t de_q_val) +{ +#define DC_DEST_STRIDE 16 + int16_t out0, out1, out2, out3; + v8i16 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v8i16 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v4i32 vres0_r, vres1_r, vres2_r, vres3_r; + v4i32 de_q_vec = __msa_fill_w(de_q_val); + + LD4x4_SH(src, src0, src1, src2, src3); + TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, src0, src1, src2, src3); + BUTTERFLY_4(src0, src2, src3, src1, vec0, vec3, vec2, vec1); + BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, 
hres1); + TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3); + BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1); + BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3); + UNPCK_R_SH_SW(vres0, vres0_r); + UNPCK_R_SH_SW(vres1, vres1_r); + UNPCK_R_SH_SW(vres2, vres2_r); + UNPCK_R_SH_SW(vres3, vres3_r); + + vres0_r *= de_q_vec; + vres1_r *= de_q_vec; + vres2_r *= de_q_vec; + vres3_r *= de_q_vec; + + SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8); + PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1); + + out0 = __msa_copy_s_h(vec0, 0); + out1 = __msa_copy_s_h(vec0, 1); + out2 = __msa_copy_s_h(vec0, 2); + out3 = __msa_copy_s_h(vec0, 3); + SH(out0, dst); + SH(out1, (dst + 2 * DC_DEST_STRIDE)); + SH(out2, (dst + 8 * DC_DEST_STRIDE)); + SH(out3, (dst + 10 * DC_DEST_STRIDE)); + dst += DC_DEST_STRIDE; + + out0 = __msa_copy_s_h(vec0, 4); + out1 = __msa_copy_s_h(vec0, 5); + out2 = __msa_copy_s_h(vec0, 6); + out3 = __msa_copy_s_h(vec0, 7); + SH(out0, dst); + SH(out1, (dst + 2 * DC_DEST_STRIDE)); + SH(out2, (dst + 8 * DC_DEST_STRIDE)); + SH(out3, (dst + 10 * DC_DEST_STRIDE)); + dst += (3 * DC_DEST_STRIDE); + + out0 = __msa_copy_s_h(vec1, 0); + out1 = __msa_copy_s_h(vec1, 1); + out2 = __msa_copy_s_h(vec1, 2); + out3 = __msa_copy_s_h(vec1, 3); + SH(out0, dst); + SH(out1, (dst + 2 * DC_DEST_STRIDE)); + SH(out2, (dst + 8 * DC_DEST_STRIDE)); + SH(out3, (dst + 10 * DC_DEST_STRIDE)); + dst += DC_DEST_STRIDE; + + out0 = __msa_copy_s_h(vec1, 4); + out1 = __msa_copy_s_h(vec1, 5); + out2 = __msa_copy_s_h(vec1, 6); + out3 = __msa_copy_s_h(vec1, 7); + SH(out0, dst); + SH(out1, (dst + 2 * DC_DEST_STRIDE)); + SH(out2, (dst + 8 * DC_DEST_STRIDE)); + SH(out3, (dst + 10 * DC_DEST_STRIDE)); + +#undef DC_DEST_STRIDE +} + +static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride) +{ + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 vec0, vec1, vec2, vec3; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; + v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; + v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; + v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; + v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 zeros = { 0 }; + + src[0] += 32; + + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + vec0 = src0 + src4; + vec1 = src0 - src4; + vec2 = src2 >> 1; + vec2 = vec2 - src6; + vec3 = src6 >> 1; + vec3 = src2 + vec3; + + BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3); + + vec0 = src7 >> 1; + vec0 = src5 - vec0 - src3 - src7; + vec1 = src3 >> 1; + vec1 = src1 - vec1 + src7 - src3; + vec2 = src5 >> 1; + vec2 = vec2 - src1 + src7 + src5; + vec3 = src1 >> 1; + vec3 = vec3 + src3 + src5 + src1; + tmp4 = vec3 >> 2; + tmp4 += vec0; + tmp5 = vec2 >> 2; + tmp5 += vec1; + tmp6 = vec1 >> 2; + tmp6 -= vec2; + tmp7 = vec0 >> 2; + tmp7 = vec3 - tmp7; + + BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + res0, res1, res2, res3, res4, res5, res6, res7); + TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7, + res0, res1, res2, res3, res4, res5, res6, res7); + UNPCK_SH_SW(res0, tmp0_r, tmp0_l); + UNPCK_SH_SW(res1, tmp1_r, tmp1_l); + UNPCK_SH_SW(res2, tmp2_r, tmp2_l); + UNPCK_SH_SW(res3, tmp3_r, tmp3_l); + 
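    /*
     * Note (illustrative, not from the patch): by this point the first 8-point
     * pass and the transpose are done; the UNPCK_SH_SW calls split each 16-bit
     * row into right/left 32-bit halves so the second pass has enough headroom
     * before the final >> 6 scaling and the clipped add to the prediction block.
     * The even half of the butterfly repeated on the widened data is, in scalar
     * form (d0..d7 being one column of coefficients):
     *
     *     e0 = d0 + d4;          e1 = d0 - d4;
     *     e2 = (d2 >> 1) - d6;   e3 = d2 + (d6 >> 1);
     */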
UNPCK_SH_SW(res4, tmp4_r, tmp4_l); + UNPCK_SH_SW(res5, tmp5_r, tmp5_l); + UNPCK_SH_SW(res6, tmp6_r, tmp6_l); + UNPCK_SH_SW(res7, tmp7_r, tmp7_l); + BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r); + + vec2_r = tmp2_r >> 1; + vec2_l = tmp2_l >> 1; + vec2_r -= tmp6_r; + vec2_l -= tmp6_l; + vec3_r = tmp6_r >> 1; + vec3_l = tmp6_l >> 1; + vec3_r += tmp2_r; + vec3_l += tmp2_l; + + BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r); + BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l); + + vec0_r = tmp7_r >> 1; + vec0_l = tmp7_l >> 1; + vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; + vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; + vec1_r = tmp3_r >> 1; + vec1_l = tmp3_l >> 1; + vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; + vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; + vec2_r = tmp5_r >> 1; + vec2_l = tmp5_l >> 1; + vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; + vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; + vec3_r = tmp1_r >> 1; + vec3_l = tmp1_l >> 1; + vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; + vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; + tmp1_r = vec3_r >> 2; + tmp1_l = vec3_l >> 2; + tmp1_r += vec0_r; + tmp1_l += vec0_l; + tmp3_r = vec2_r >> 2; + tmp3_l = vec2_l >> 2; + tmp3_r += vec1_r; + tmp3_l += vec1_l; + tmp5_r = vec1_r >> 2; + tmp5_l = vec1_l >> 2; + tmp5_r -= vec2_r; + tmp5_l -= vec2_l; + tmp7_r = vec0_r >> 2; + tmp7_l = vec0_l >> 2; + tmp7_r = vec3_r - tmp7_r; + tmp7_l = vec3_l - tmp7_l; + + BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r); + BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r); + BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r); + BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r); + SRA_4V(res0_r, res0_l, res1_r, res1_l, 6); + SRA_4V(res2_r, res2_l, res3_r, res3_l, 6); + SRA_4V(res4_r, res4_l, res5_r, res5_l, 6); + SRA_4V(res6_r, res6_l, res7_r, res7_l, 6); + PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, + res0, res1, res2, res3); + PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, + res4, res5, res6, res7); + LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, + tmp0, tmp1, tmp2, tmp3); + ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, + tmp4, tmp5, tmp6, tmp7); + ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, + res0, res1, res2, res3); + ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, + res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, + dst0, dst1, dst2, dst3); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + ST8x4_UB(dst2, dst3, dst, dst_stride); +} + +static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + int32_t dc_val; + v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v8i16 dc; + v16i8 zeros = { 0 }; + + dc_val = (src[0] + 32) >> 6; + dc = __msa_fill_h(dc_val); + + src[0] = 0; + + LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, + dst0_r, dst1_r, dst2_r, dst3_r); + ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, + dst4_r, dst5_r, dst6_r, dst7_r); + ADD4(dst0_r, 
dc, dst1_r, dc, dst2_r, dc, dst3_r, dc, + dst0_r, dst1_r, dst2_r, dst3_r); + ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc, + dst4_r, dst5_r, dst6_r, dst7_r); + CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); + CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r); + PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r, + dst0, dst1, dst2, dst3); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + ST8x4_UB(dst2, dst3, dst, dst_stride); +} + +void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + avc_idct4x4_addblk_msa(dst, src, dst_stride); + memset(src, 0, 16 * sizeof(dctcoef)); +} + +void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + avc_idct8_addblk_msa(dst, src, dst_stride); + memset(src, 0, 64 * sizeof(dctcoef)); +} + +void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + avc_idct4x4_addblk_dc_msa(dst, src, dst_stride); +} + +void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, + int32_t dst_stride) +{ + avc_idct8_dc_addblk_msa(dst, src, dst_stride); +} + +void ff_h264_idct_add16_msa(uint8_t *dst, + const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nzc[15 * 8]) +{ + int32_t i; + + for (i = 0; i < 16; i++) { + int32_t nnz = nzc[scan8[i]]; + + if (nnz) { + if (nnz == 1 && ((dctcoef *) block)[i * 16]) + ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + else + ff_h264_idct_add_msa(dst + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + } + } +} + +void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nzc[15 * 8]) +{ + int32_t cnt; + + for (cnt = 0; cnt < 16; cnt += 4) { + int32_t nnz = nzc[scan8[cnt]]; + + if (nnz) { + if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) + ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt], + block + cnt * 16 * sizeof(pixel), + dst_stride); + else + ff_h264_idct8_addblk_msa(dst + blk_offset[cnt], + block + cnt * 16 * sizeof(pixel), + dst_stride); + } + } +} + +void ff_h264_idct_add8_msa(uint8_t **dst, + const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nzc[15 * 8]) +{ + int32_t i, j; + + for (j = 1; j < 3; j++) { + for (i = (j * 16); i < (j * 16 + 4); i++) { + if (nzc[scan8[i]]) + ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + else if (((dctcoef *) block)[i * 16]) + ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + } + } +} + +void ff_h264_idct_add8_422_msa(uint8_t **dst, + const int32_t *blk_offset, + int16_t *block, int32_t dst_stride, + const uint8_t nzc[15 * 8]) +{ + int32_t i, j; + + for (j = 1; j < 3; j++) { + for (i = (j * 16); i < (j * 16 + 4); i++) { + if (nzc[scan8[i]]) + ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + else if (((dctcoef *) block)[i * 16]) + ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + } + } + + for (j = 1; j < 3; j++) { + for (i = (j * 16 + 4); i < (j * 16 + 8); i++) { + if (nzc[scan8[i + 4]]) + ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4], + block + i * 16 * sizeof(pixel), + dst_stride); + else if (((dctcoef *) block)[i * 16]) + ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4], + block + i * 16 * sizeof(pixel), + dst_stride); + } + } +} + +void 
ff_h264_idct_add16_intra_msa(uint8_t *dst, + const int32_t *blk_offset, + int16_t *block, + int32_t dst_stride, + const uint8_t nzc[15 * 8]) +{ + int32_t i; + + for (i = 0; i < 16; i++) { + if (nzc[scan8[i]]) + ff_h264_idct_add_msa(dst + blk_offset[i], + block + i * 16 * sizeof(pixel), dst_stride); + else if (((dctcoef *) block)[i * 16]) + ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i], + block + i * 16 * sizeof(pixel), + dst_stride); + } +} + +void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, + int32_t de_qval) +{ + avc_deq_idct_luma_dc_msa(dst, src, de_qval); +} diff --git a/libavcodec/mips/h264pred_init_mips.c b/libavcodec/mips/h264pred_init_mips.c new file mode 100644 index 0000000000..c33d8f7cdb --- /dev/null +++ b/libavcodec/mips/h264pred_init_mips.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "h264dsp_mips.h" +#include "h264pred_mips.h" + +#if HAVE_MSA +static av_cold void h264_pred_init_msa(H264PredContext *h, int codec_id, + const int bit_depth, + const int chroma_format_idc) +{ + if (8 == bit_depth) { + if (chroma_format_idc == 1) { + h->pred8x8[VERT_PRED8x8] = ff_h264_intra_pred_vert_8x8_msa; + h->pred8x8[HOR_PRED8x8] = ff_h264_intra_pred_horiz_8x8_msa; + } + + if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) { + if (chroma_format_idc == 1) { + h->pred8x8[PLANE_PRED8x8] = ff_h264_intra_predict_plane_8x8_msa; + } + } + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 + && codec_id != AV_CODEC_ID_VP8) { + if (chroma_format_idc == 1) { + h->pred8x8[DC_PRED8x8] = ff_h264_intra_predict_dc_4blk_8x8_msa; + h->pred8x8[LEFT_DC_PRED8x8] = + ff_h264_intra_predict_hor_dc_8x8_msa; + h->pred8x8[TOP_DC_PRED8x8] = + ff_h264_intra_predict_vert_dc_8x8_msa; + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = + ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa; + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = + ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa; + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = + ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa; + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = + ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa; + } + } else { + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred8x8[7] = ff_vp8_pred8x8_127_dc_8_msa; + h->pred8x8[8] = ff_vp8_pred8x8_129_dc_8_msa; + } + } + + if (chroma_format_idc == 1) { + h->pred8x8[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_8x8_msa; + } + + h->pred16x16[DC_PRED8x8] = ff_h264_intra_pred_dc_16x16_msa; + h->pred16x16[VERT_PRED8x8] = ff_h264_intra_pred_vert_16x16_msa; + h->pred16x16[HOR_PRED8x8] = ff_h264_intra_pred_horiz_16x16_msa; + + switch (codec_id) { + case AV_CODEC_ID_SVQ3: + ; + break; + case AV_CODEC_ID_RV40: + ; 
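            /*
             * Note (illustrative, not from the patch): the empty SVQ3 and RV40
             * cases are intentional - neither codec gets an MSA plane predictor
             * here, so whatever the generic init installed is kept. Only the
             * default (H.264) path below switches PLANE_PRED8x8 to
             * ff_h264_intra_predict_plane_16x16_msa, while VP7/VP8 instead get
             * the 127/129 DC predictors.
             */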
+ break; + case AV_CODEC_ID_VP7: + case AV_CODEC_ID_VP8: + h->pred16x16[7] = ff_vp8_pred16x16_127_dc_8_msa; + h->pred16x16[8] = ff_vp8_pred16x16_129_dc_8_msa; + break; + default: + h->pred16x16[PLANE_PRED8x8] = + ff_h264_intra_predict_plane_16x16_msa; + break; + } + + h->pred16x16[LEFT_DC_PRED8x8] = ff_h264_intra_pred_dc_left_16x16_msa; + h->pred16x16[TOP_DC_PRED8x8] = ff_h264_intra_pred_dc_top_16x16_msa; + h->pred16x16[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_16x16_msa; + } +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void h264_pred_init_mmi(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc) +{ + if (bit_depth == 8) { + if (chroma_format_idc == 1) { + h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmi; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmi; + } else { + h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x16_vertical_8_mmi; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x16_horizontal_8_mmi; + } + + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmi; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmi; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmi; + h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmi; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmi; + +#if ARCH_MIPS64 + switch (codec_id) { + case AV_CODEC_ID_SVQ3: + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmi; + break; + case AV_CODEC_ID_RV40: + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmi; + break; + case AV_CODEC_ID_VP7: + case AV_CODEC_ID_VP8: + break; + default: + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmi; + break; + } +#endif + + if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { + if (chroma_format_idc == 1) { + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmi; + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmi; + } + } + } +} +#endif /* HAVE_MMI */ + +av_cold void ff_h264_pred_init_mips(H264PredContext *h, int codec_id, + int bit_depth, + const int chroma_format_idc) +{ +#if HAVE_MSA + h264_pred_init_msa(h, codec_id, bit_depth, chroma_format_idc); +#endif // #if HAVE_MSA +#if HAVE_MMI + h264_pred_init_mmi(h, codec_id, bit_depth, chroma_format_idc); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/h264pred_mips.h b/libavcodec/mips/h264pred_mips.h new file mode 100644 index 0000000000..136e291252 --- /dev/null +++ b/libavcodec/mips/h264pred_mips.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_H264PRED_MIPS_H +#define AVCODEC_MIPS_H264PRED_MIPS_H + +#include "constants.h" +#include "libavcodec/h264pred.h" + +void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright, + ptrdiff_t stride); +void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright, + ptrdiff_t stride); +void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft, + int has_topright, ptrdiff_t stride); +void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride); +void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride); +void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride); + +#endif /* AVCODEC_MIPS_H264PRED_MIPS_H */ diff --git a/libavcodec/mips/h264pred_mmi.c b/libavcodec/mips/h264pred_mmi.c new file mode 100644 index 0000000000..bb795a1aba --- /dev/null +++ b/libavcodec/mips/h264pred_mmi.c @@ -0,0 +1,992 @@ +/* + * Loongson SIMD optimized h264pred + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264pred_mips.h" +#include "libavcodec/bit_depth_template.c" +#include "libavutil/mips/asmdefs.h" +#include "constants.h" + +void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + double ftmp[2]; + uint64_t tmp[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x08 \n\t" + "gsldlc1 %[ftmp0], 0x07(%[srcA]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[srcA]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[srcA]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[srcA]) \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[src]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src) + : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride)) + : "memory" + ); +} + +void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + uint64_t tmp[3]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDI "%[addr0], %[src], -0x01 \n\t" + PTR_ADDU "%[addr1], %[src], $0 \n\t" + "dli %[tmp2], 0x08 \n\t" + "1: \n\t" + "lbu %[tmp0], 0x00(%[addr0]) \n\t" + "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t" + "swl %[tmp1], 0x07(%[addr1]) \n\t" + "swr %[tmp1], 0x00(%[addr1]) \n\t" + "swl %[tmp1], 0x0f(%[addr1]) \n\t" + "swr %[tmp1], 0x08(%[addr1]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "lbu %[tmp0], 0x00(%[addr0]) \n\t" + "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t" + "swl %[tmp1], 0x07(%[addr1]) \n\t" + "swr %[tmp1], 0x00(%[addr1]) \n\t" + "swl %[tmp1], 0x0f(%[addr1]) \n\t" + "swr %[tmp1], 0x08(%[addr1]) \n\t" + "daddi %[tmp2], %[tmp2], -0x01 \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "bnez %[tmp2], 1b \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride), + [ff_pb_1]"r"(ff_pb_1) + : "memory" + ); +} + +void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + uint64_t tmp[4]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDI "%[addr0], %[src], -0x01 \n\t" + "dli %[tmp0], 0x08 \n\t" + "xor %[tmp3], %[tmp3], %[tmp3] \n\t" + "1: \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "daddu %[tmp3], %[tmp3], %[tmp1] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + "daddu %[tmp3], %[tmp3], %[tmp1] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + + "dli %[tmp0], 0x08 \n\t" + PTR_SUBU "%[addr0], %[src], %[stride] \n\t" + "2: \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "daddu %[tmp3], %[tmp3], %[tmp1] \n\t" + PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + "daddu %[tmp3], %[tmp3], %[tmp1] \n\t" + PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t" + "bnez %[tmp0], 2b \n\t" + + "daddiu %[tmp3], %[tmp3], 0x10 \n\t" + "dsra %[tmp3], 
0x05 \n\t" + "dmul %[tmp2], %[tmp3], %[ff_pb_1] \n\t" + PTR_ADDU "%[addr0], %[src], $0 \n\t" + "dli %[tmp0], 0x08 \n\t" + "3: \n\t" + "swl %[tmp2], 0x07(%[addr0]) \n\t" + "swr %[tmp2], 0x00(%[addr0]) \n\t" + "swl %[tmp2], 0x0f(%[addr0]) \n\t" + "swr %[tmp2], 0x08(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "swl %[tmp2], 0x07(%[addr0]) \n\t" + "swr %[tmp2], 0x00(%[addr0]) \n\t" + "swl %[tmp2], 0x0f(%[addr0]) \n\t" + "swr %[tmp2], 0x08(%[addr0]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "bnez %[tmp0], 3b \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride), + [ff_pb_1]"r"(ff_pb_1) + : "memory" + ); +} + +void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft, + int has_topright, ptrdiff_t stride) +{ + uint32_t dc; + double ftmp[11]; + mips_reg tmp[3]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp10], 0x07(%[srcA]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[srcA]) \n\t" + "gsldlc1 %[ftmp9], 0x07(%[src0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[src0]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[src1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src1]) \n\t" + + "punpcklbh %[ftmp7], %[ftmp10], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp2], %[ftmp8], %[ftmp0] \n\t" + "bnez %[has_topleft], 1f \n\t" + "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + + "1: \n\t" + "bnez %[has_topright], 2f \n\t" + "pinsrh_3 %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "2: \n\t" + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ff_pw_2] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[ff_pw_2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ff_pw_2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_pw_2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "packushb %[ftmp9], %[ftmp7], %[ftmp6] \n\t" + "biadd %[ftmp10], %[ftmp9] \n\t" + "mfc1 %[tmp1], %[ftmp10] \n\t" + "addiu %[tmp1], %[tmp1], 0x04 \n\t" + "srl %[tmp1], %[tmp1], 0x03 \n\t" + "mul %[dc], %[tmp1], %[ff_pb_1] \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [dc]"=r"(dc) + : [srcA]"r"((mips_reg)(src-stride-1)), + [src0]"r"((mips_reg)(src-stride)), + [src1]"r"((mips_reg)(src-stride+1)), + [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright), + [ff_pb_1]"r"(ff_pb_1), [ff_pw_2]"f"(ff_pw_2) + : "memory" + ); + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "punpcklwd %[ftmp0], %[dc], %[dc] \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t" + 
"daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src) + : [dc]"f"(dc), [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright, + ptrdiff_t stride) +{ + uint32_t dc, dc1, dc2; + double ftmp[14]; + mips_reg tmp[1]; + + const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2; + const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2; + const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2; + const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2; + const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2; + const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2; + const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2; + const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2; + + __asm__ volatile ( + "gsldlc1 %[ftmp4], 0x07(%[srcA]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[srcA]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src0]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src1]) \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x03 \n\t" + "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "pshufh %[ftmp3], %[ftmp8], %[ftmp1] \n\t" + "pshufh %[ftmp13], %[ftmp12], %[ftmp1] \n\t" + "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t" + "pinsrh_3 %[ftmp12], %[ftmp12], %[ftmp3] \n\t" + "bnez %[has_topleft], 1f \n\t" + "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + + "1: \n\t" + "bnez %[has_topright], 2f \n\t" + "pshufh %[ftmp13], %[ftmp10], %[ftmp1] \n\t" + "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t" + + "2: \n\t" + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + "pmullh %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "packushb %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "biadd %[ftmp4], %[ftmp5] \n\t" + "mfc1 %[dc2], %[ftmp4] \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [dc2]"=r"(dc2) + : [srcA]"r"((mips_reg)(src-stride-1)), + [src0]"r"((mips_reg)(src-stride)), + [src1]"r"((mips_reg)(src-stride+1)), + [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright) + : "memory" + ); + + dc1 = l0+l1+l2+l3+l4+l5+l6+l7; + dc = 
((dc1+dc2+8)>>4)*0x01010101U; + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "punpcklwd %[ftmp0], %[dc], %[dc] \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdxc1 %[ftmp0], 0x00(%[src], %[stride]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src) + : [dc]"f"(dc), [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft, + int has_topright, ptrdiff_t stride) +{ + double ftmp[12]; + mips_reg tmp[1]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp3], 0x07(%[srcA]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[srcA]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[src0]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src0]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp5], %[ftmp0] \n\t" + "bnez %[has_topleft], 1f \n\t" + "pinsrh_0 %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "1: \n\t" + "bnez %[has_topright], 2f \n\t" + "pinsrh_3 %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + + "2: \n\t" + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "packushb %[ftmp4], %[ftmp6], %[ftmp7] \n\t" + "sdc1 %[ftmp4], 0x00(%[src]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [src]"=r"(src) + : [srcA]"r"((mips_reg)(src-stride-1)), + [src0]"r"((mips_reg)(src-stride)), + [src1]"r"((mips_reg)(src-stride+1)), + [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright) + : "memory" + ); + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]), + 
[src]"+&r"(src) + : [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + const int dc = (src[-stride] + src[1-stride] + src[2-stride] + + src[3-stride] + src[-1+0*stride] + src[-1+1*stride] + + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + uint64_t tmp[2]; + mips_reg addr[1]; + + __asm__ volatile ( + PTR_ADDU "%[tmp0], %[dc], $0 \n\t" + "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t" + "xor %[addr0], %[addr0], %[addr0] \n\t" + "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "gsswx %[tmp1], 0x00(%[src], %[addr0]) \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [addr0]"=&r"(addr[0]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride), + [dc]"r"(dc), [ff_pb_1]"r"(ff_pb_1) + : "memory" + ); +} + +void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + uint64_t tmp[2]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_SUBU "%[addr0], %[src], %[stride] \n\t" + PTR_ADDU "%[addr1], %[src], $0 \n\t" + "ldl %[tmp0], 0x07(%[addr0]) \n\t" + "ldr %[tmp0], 0x00(%[addr0]) \n\t" + "dli %[tmp1], 0x04 \n\t" + "1: \n\t" + "sdl %[tmp0], 0x07(%[addr1]) \n\t" + "sdr %[tmp0], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr1], %[stride] \n\t" + "sdl %[tmp0], 0x07(%[addr1]) \n\t" + "sdr %[tmp0], 0x00(%[addr1]) \n\t" + "daddi %[tmp1], -0x01 \n\t" + PTR_ADDU "%[addr1], %[stride] \n\t" + "bnez %[tmp1], 1b \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + uint64_t tmp[3]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDI "%[addr0], %[src], -0x01 \n\t" + PTR_ADDU "%[addr1], %[src], $0 \n\t" + "dli %[tmp0], 0x04 \n\t" + "1: \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t" + "swl %[tmp2], 0x07(%[addr1]) \n\t" + "swr %[tmp2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t" + "swl %[tmp2], 0x07(%[addr1]) \n\t" + "swr %[tmp2], 0x00(%[addr1]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride), + [ff_pb_1]"r"(ff_pb_1) + : "memory" + ); +} + +void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + double ftmp[4]; + uint64_t tmp[1]; + mips_reg addr[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + PTR_SUBU "%[addr0], %[src], %[stride] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "biadd %[ftmp2], %[ftmp2] \n\t" + "biadd %[ftmp3], %[ftmp3] \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddush %[ftmp2], 
%[ftmp2], %[ftmp1] \n\t" + "paddush %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psrlh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp3] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), + [src]"+&r"(src) + : [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + double ftmp[5]; + mips_reg addr[7]; + + __asm__ volatile ( + "negu %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[src] \n\t" + PTR_ADDIU "%[addr1], %[addr0], 0x04 \n\t" + "lbu %[addr2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr3], $0, %[addr2] \n\t" + PTR_ADDIU "%[addr0], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr4], $0, %[addr2] \n\t" + PTR_ADDIU "%[addr1], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t" + PTR_ADDIU "%[addr0], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t" + PTR_ADDIU "%[addr1], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t" + PTR_ADDIU "%[addr0], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t" + PTR_ADDIU "%[addr1], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t" + PTR_ADDIU "%[addr0], 0x01 \n\t" + "lbu %[addr2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t" + PTR_ADDIU "%[addr1], 0x01 \n\t" + "dli %[addr2], -0x01 \n\t" + PTR_ADDU "%[addr2], %[addr2], %[src] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr5], $0, %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr6], $0, %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t" + PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t" + PTR_ADDU 
"%[addr2], %[addr2], %[stride] \n\t" + "lbu %[addr1], 0x00(%[addr2]) \n\t" + PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t" + PTR_ADDU "%[addr3], %[addr3], %[addr5] \n\t" + PTR_ADDIU "%[addr3], %[addr3], 0x04 \n\t" + PTR_ADDIU "%[addr4], %[addr4], 0x02 \n\t" + PTR_ADDIU "%[addr1], %[addr6], 0x02 \n\t" + PTR_ADDU "%[addr2], %[addr4], %[addr1] \n\t" + PTR_SRL "%[addr3], 0x03 \n\t" + PTR_SRL "%[addr4], 0x02 \n\t" + PTR_SRL "%[addr1], 0x02 \n\t" + PTR_SRL "%[addr2], 0x03 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[addr3], %[ftmp1] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "dmtc1 %[addr4], %[ftmp2] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "dmtc1 %[addr1], %[ftmp3] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "dmtc1 %[addr2], %[ftmp4] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + PTR_ADDU "%[addr0], $0, %[src] \n\t" + "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp1], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [addr6]"=&r"(addr[6]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride) + : "memory" + ); +} + +void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + double ftmp[1]; + uint64_t tmp[1]; + + __asm__ volatile ( + "gsldlc1 %[ftmp0], 0x07(%[srcA]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[srcA]) \n\t" + "dli %[tmp0], 0x04 \n\t" + "1: \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src) + : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride)) + : "memory" + ); +} + +void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + uint64_t tmp[3]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDI "%[addr0], %[src], -0x01 \n\t" + PTR_ADDU "%[addr1], %[src], $0 \n\t" + "dli %[tmp0], 0x08 \n\t" + "1: \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t" + "swl %[tmp2], 0x07(%[addr1]) \n\t" + "swr %[tmp2], 0x00(%[addr1]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "lbu %[tmp1], 0x00(%[addr0]) \n\t" + "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t" + "swl 
%[tmp2], 0x07(%[addr1]) \n\t" + "swr %[tmp2], 0x00(%[addr1]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]) + : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride), + [ff_pb_1]"r"(ff_pb_1) + : "memory" + ); +} + +static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride, + const int svq3, const int rv40) +{ + double ftmp[11]; + uint64_t tmp[7]; + mips_reg addr[1]; + + __asm__ volatile( + PTR_SUBU "%[addr0], %[src], %[stride] \n\t" + "dli %[tmp2], 0x20 \n\t" + "dmtc1 %[tmp2], %[ftmp4] \n\t" + "gsldlc1 %[ftmp0], 0x06(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp0], -0x01(%[addr0]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t" + "dsrl %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "dsrl %[ftmp3], %[ftmp2], %[ftmp4] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "dli %[tmp2], 0x0e \n\t" + "dmtc1 %[tmp2], %[ftmp4] \n\t" + "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "dli %[tmp2], 0x01 \n\t" + "dmtc1 %[tmp2], %[ftmp4] \n\t" + "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "paddsh %[ftmp5], %[ftmp0], %[ftmp1] \n\t" + + PTR_ADDIU "%[addr0], %[src], -0x01 \n\t" + PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp2], 0x00(%[addr0]) \n\t" + "lbu %[tmp6], 0x10(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp3], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp4], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp5], 0x00(%[addr0]) \n\t" + "dsll %[tmp3], %[tmp3], 0x10 \n\t" + "dsll %[tmp4], %[tmp4], 0x20 \n\t" + "dsll %[tmp5], %[tmp5], 0x30 \n\t" + "or %[tmp4], %[tmp4], %[tmp5] \n\t" + "or %[tmp2], %[tmp2], %[tmp3] \n\t" + "or %[tmp2], %[tmp2], %[tmp4] \n\t" + "dmtc1 %[tmp2], %[ftmp0] \n\t" + + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp3], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp4], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp5], 0x00(%[addr0]) \n\t" + "dsll %[tmp3], %[tmp3], 0x10 \n\t" + "dsll %[tmp4], %[tmp4], 0x20 \n\t" + "dsll %[tmp5], %[tmp5], 0x30 \n\t" + "or %[tmp4], %[tmp4], %[tmp5] \n\t" + "or %[tmp2], %[tmp2], %[tmp3] \n\t" + "or %[tmp2], %[tmp2], %[tmp4] \n\t" + "dmtc1 %[tmp2], %[ftmp1] \n\t" + + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp3], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp4], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp5], 0x00(%[addr0]) \n\t" + "dsll 
%[tmp3], %[tmp3], 0x10 \n\t" + "dsll %[tmp4], %[tmp4], 0x20 \n\t" + "dsll %[tmp5], %[tmp5], 0x30 \n\t" + "or %[tmp4], %[tmp4], %[tmp5] \n\t" + "or %[tmp2], %[tmp2], %[tmp3] \n\t" + "or %[tmp2], %[tmp2], %[tmp4] \n\t" + "dmtc1 %[tmp2], %[ftmp2] \n\t" + + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp2], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp3], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp4], 0x00(%[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "lbu %[tmp5], 0x00(%[addr0]) \n\t" + "daddu %[tmp6], %[tmp6], %[tmp5] \n\t" + "daddiu %[tmp6], %[tmp6], 0x01 \n\t" + "dsll %[tmp6], %[tmp6], 0x04 \n\t" + + "dsll %[tmp3], %[tmp3], 0x10 \n\t" + "dsll %[tmp4], %[tmp4], 0x20 \n\t" + "dsll %[tmp5], %[tmp5], 0x30 \n\t" + "or %[tmp4], %[tmp4], %[tmp5] \n\t" + "or %[tmp2], %[tmp2], %[tmp3] \n\t" + "or %[tmp2], %[tmp2], %[tmp4] \n\t" + "dmtc1 %[tmp2], %[ftmp3] \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "dli %[tmp2], 0x0e \n\t" + "dmtc1 %[tmp2], %[ftmp4] \n\t" + "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "dli %[tmp2], 0x01 \n\t" + "dmtc1 %[tmp2], %[ftmp4] \n\t" + "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "paddsh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + + "dmfc1 %[tmp0], %[ftmp5] \n\t" + "dsll %[tmp0], %[tmp0], 0x30 \n\t" + "dsra %[tmp0], %[tmp0], 0x30 \n\t" + "dmfc1 %[tmp1], %[ftmp6] \n\t" + "dsll %[tmp1], %[tmp1], 0x30 \n\t" + "dsra %[tmp1], %[tmp1], 0x30 \n\t" + + "beqz %[svq3], 1f \n\t" + "dli %[tmp2], 0x04 \n\t" + "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t" + "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t" + "dli %[tmp2], 0x05 \n\t" + "dmul %[tmp0], %[tmp0], %[tmp2] \n\t" + "dmul %[tmp1], %[tmp1], %[tmp2] \n\t" + "dli %[tmp2], 0x10 \n\t" + "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t" + "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t" + "daddu %[tmp2], %[tmp0], $0 \n\t" + "daddu %[tmp0], %[tmp1], $0 \n\t" + "daddu %[tmp1], %[tmp2], $0 \n\t" + "b 2f \n\t" + + "1: \n\t" + "beqz %[rv40], 1f \n\t" + "dsra %[tmp2], %[tmp0], 0x02 \n\t" + "daddu %[tmp0], %[tmp0], %[tmp2] \n\t" + "dsra %[tmp2], %[tmp1], 0x02 \n\t" + "daddu %[tmp1], %[tmp1], %[tmp2] \n\t" + "dsra %[tmp0], %[tmp0], 0x04 \n\t" + "dsra %[tmp1], %[tmp1], 0x04 \n\t" + "b 2f \n\t" + + "1: \n\t" + "dli %[tmp2], 0x05 \n\t" + "dmul %[tmp0], %[tmp0], %[tmp2] \n\t" + "dmul %[tmp1], %[tmp1], %[tmp2] \n\t" + "daddiu %[tmp0], %[tmp0], 0x20 \n\t" + "daddiu %[tmp1], %[tmp1], 0x20 \n\t" + "dsra %[tmp0], %[tmp0], 0x06 \n\t" + "dsra %[tmp1], %[tmp1], 0x06 \n\t" + + "2: \n\t" + "daddu %[tmp3], %[tmp0], %[tmp1] \n\t" + "dli %[tmp2], 0x07 \n\t" + "dmul %[tmp3], %[tmp3], %[tmp2] \n\t" + "dsubu %[tmp6], %[tmp6], %[tmp3] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "dmtc1 %[tmp0], %[ftmp0] \n\t" + "pshufh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "dmtc1 %[tmp1], %[ftmp5] \n\t" + "pshufh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "dmtc1 %[tmp6], %[ftmp6] \n\t" + "pshufh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "dli %[tmp2], 0x05 \n\t" + "dmtc1 %[tmp2], %[ftmp7] \n\t" + "pmullh %[ftmp1], %[ff_pw_0to3], %[ftmp0] \n\t" + "dmtc1 %[ff_pw_4to7], %[ftmp2] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "dmtc1 %[ff_pw_8tob], %[ftmp3] \n\t" + "pmullh %[ftmp3], 
%[ftmp3], %[ftmp0] \n\t" + "dmtc1 %[ff_pw_ctof], %[ftmp4] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + + "dli %[tmp0], 0x10 \n\t" + PTR_ADDU "%[addr0], %[src], $0 \n\t" + "1: \n\t" + "paddsh %[ftmp8], %[ftmp1], %[ftmp6] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "paddsh %[ftmp9], %[ftmp2], %[ftmp6] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t" + "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t" + + "paddsh %[ftmp8], %[ftmp3], %[ftmp6] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "paddsh %[ftmp9], %[ftmp4], %[ftmp6] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t" + "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[addr0]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[addr0]) \n\t" + + "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t" + "daddiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]), + [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]), + [tmp6]"=&r"(tmp[6]), + [addr0]"=&r"(addr[0]) + : [src]"r"(src), [stride]"r"((mips_reg)stride), + [svq3]"r"(svq3), [rv40]"r"(rv40), + [ff_pw_m8tom5]"f"(ff_pw_m8tom5), [ff_pw_m4tom1]"f"(ff_pw_m4tom1), + [ff_pw_1to4]"f"(ff_pw_1to4), [ff_pw_5to8]"f"(ff_pw_5to8), + [ff_pw_0to3]"f"(ff_pw_0to3), [ff_pw_4to7]"r"(ff_pw_4to7), + [ff_pw_8tob]"r"(ff_pw_8tob), [ff_pw_ctof]"r"(ff_pw_ctof) + : "memory" + ); +} + +void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + pred16x16_plane_compat_mmi(src, stride, 0, 0); +} + +void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + pred16x16_plane_compat_mmi(src, stride, 1, 0); +} + +void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride) +{ + pred16x16_plane_compat_mmi(src, stride, 0, 1); +} diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c new file mode 100644 index 0000000000..cddcd2e878 --- /dev/null +++ b/libavcodec/mips/h264pred_msa.c @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h264dsp_mips.h" + +static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0 * src_stride] * 0x0101010101010101; + out1 = src[1 * src_stride] * 0x0101010101010101; + out2 = src[2 * src_stride] * 0x0101010101010101; + out3 = src[3 * src_stride] * 0x0101010101010101; + out4 = src[4 * src_stride] * 0x0101010101010101; + out5 = src[5 * src_stride] * 0x0101010101010101; + out6 = src[6 * src_stride] * 0x0101010101010101; + out7 = src[7 * src_stride] * 0x0101010101010101; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + src += src_stride; + inp1 = src[0]; + src += src_stride; + inp2 = src[0]; + src += src_stride; + inp3 = src[0]; + src += src_stride; + + src0 = (v16u8) __msa_fill_b(inp0); + src1 = (v16u8) __msa_fill_b(inp1); + src2 = (v16u8) __msa_fill_b(inp2); + src3 = (v16u8) __msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ + uint32_t row; + uint32_t out, addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if (is_left && is_above) { + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 8; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 8) >> 4; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { + for (row = 0; row < 8; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 4) >> 3; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum = (v2u64) __msa_srari_d((v2i64) sum, 3); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { + store = (v16u8) __msa_ldi_b(128); + } + + out = __msa_copy_u_w((v4i32) store, 0); + + for (row = 8; row--;) { + SW(out, dst); + SW(out, (dst + 4)); + dst += dst_stride; + } +} + 
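For reference, the MSA DC predictors above (and the 16x16 variant that follows) all implement the same scalar rule: sum whichever of the top and left neighbour rows are available, round to the nearest average, and fall back to 128 when neither edge exists. A minimal scalar sketch of that rule, for illustration only (it is not part of this patch; intra_predict_dc_ref and its parameter layout are hypothetical), assuming 8-bit pixels:

#include <stdint.h>

static void intra_predict_dc_ref(const uint8_t *top, const uint8_t *left,
                                 int left_stride, uint8_t *dst, int stride,
                                 int size, int has_top, int has_left)
{
    /* size is 8 or 16; shift = log2(size) */
    int shift = (size == 16) ? 4 : 3;
    int i, j, sum = 0, dc;

    if (has_top)
        for (i = 0; i < size; i++)
            sum += top[i];
    if (has_left)
        for (i = 0; i < size; i++)
            sum += left[i * left_stride];

    if (has_top && has_left)
        dc = (sum + size) >> (shift + 1);   /* e.g. (sum + 16) >> 5 for 16x16 */
    else if (has_top || has_left)
        dc = (sum + size / 2) >> shift;     /* only one neighbour edge present */
    else
        dc = 128;                           /* neither edge available */

    for (i = 0; i < size; i++, dst += stride)
        for (j = 0; j < size; j++)
            dst[j] = dc;
}

The MSA versions reach the same sums with __msa_hadd_u_h/_w/_d horizontal adds over a vector load of the top row instead of the scalar loop, and splat the resulting DC byte with __msa_fill_b before storing whole rows at once.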
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ + uint32_t row; + uint32_t addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if (is_left && is_above) { + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = __msa_hadd_u_d(sum_top, sum_top); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 16; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 16) >> 5; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { + for (row = 0; row < 16; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 8) >> 4; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum = (v2u64) __msa_srari_d((v2i64) sum, 4); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { + store = (v16u8) __msa_ldi_b(128); + } + + for (row = 16; row--;) { + ST_UB(store, dst); + dst += dst_stride; + } +} + +#define INTRA_PREDICT_VALDC_8X8_MSA(val) \ +static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row, out; \ + v16i8 store; \ + \ + store = __msa_ldi_b(val); \ + out = __msa_copy_u_w((v4i32) store, 0); \ + \ + for (row = 8; row--;) { \ + SW(out, dst); \ + SW(out, (dst + 4)); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_8X8_MSA(127); +INTRA_PREDICT_VALDC_8X8_MSA(129); + +#define INTRA_PREDICT_VALDC_16X16_MSA(val) \ +static void intra_predict_##val##dc_16x16_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row; \ + v16u8 store; \ + \ + store = (v16u8) __msa_ldi_b(val); \ + \ + for (row = 16; row--;) { \ + ST_UB(store, dst); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_16X16_MSA(127); +INTRA_PREDICT_VALDC_16X16_MSA(129); + +static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lpcnt; + int32_t res, res0, res1, res2, res3; + uint64_t out0, out1; + v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 }; + v4i32 int_multiplier = { 0, 1, 2, 3 }; + v16u8 src_top; + v8i16 vec9, vec10, vec11; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8; + v2i64 sum; + + src_top = LD_UB(src - (stride + 1)); + src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top); + + vec9 = __msa_hsub_u_h(src_top, src_top); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w(vec9, vec9); + sum = __msa_hadd_s_d(vec8, vec8); + + res0 = __msa_copy_s_w((v4i32) sum, 0); + + res1 = (src[4 * stride - 1] - src[2 * stride - 1]) + + 2 * (src[5 * stride - 1] - src[stride - 1]) + + 3 * (src[6 * stride - 1] - src[-1]) + + 4 * (src[7 * stride - 1] - src[-stride - 1]); + + res0 *= 17; + res1 *= 17; + res0 = (res0 + 16) >> 5; + res1 = (res1 + 16) >> 5; + + res3 = 3 * (res0 + res1); + res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1); + res = res2 - res3; + + vec8 = __msa_fill_w(res0); + vec4 = __msa_fill_w(res); + vec2 = 
__msa_fill_w(res1); + vec5 = vec8 * int_multiplier; + vec3 = vec8 * 4; + + for (lpcnt = 4; lpcnt--;) { + vec0 = vec5; + vec0 += vec4; + vec1 = vec0 + vec3; + vec6 = vec5; + vec4 += vec2; + vec6 += vec4; + vec7 = vec6 + vec3; + + SRA_4V(vec0, vec1, vec6, vec7, 5); + PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11); + CLIP_SH2_0_255(vec10, vec11); + PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11); + + out0 = __msa_copy_s_d((v2i64) vec10, 0); + out1 = __msa_copy_s_d((v2i64) vec11, 0); + SD(out0, src); + src += stride; + SD(out1, src); + src += stride; + + vec4 += vec2; + } +} + +static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride) +{ + uint8_t lpcnt; + int32_t res0, res1, res2, res3; + uint64_t load0, load1; + v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v4i32 int_multiplier = { 0, 1, 2, 3 }; + v16u8 src_top = { 0 }; + v8i16 vec9, vec10; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add; + + load0 = LD(src - (stride + 1)); + load1 = LD(src - (stride + 1) + 9); + + INSERT_D2_UB(load0, load1, src_top); + + src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top); + + vec9 = __msa_hsub_u_h(src_top, src_top); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w(vec9, vec9); + res_add = (v4i32) __msa_hadd_s_d(vec8, vec8); + + res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2); + + res1 = (src[8 * stride - 1] - src[6 * stride - 1]) + + 2 * (src[9 * stride - 1] - src[5 * stride - 1]) + + 3 * (src[10 * stride - 1] - src[4 * stride - 1]) + + 4 * (src[11 * stride - 1] - src[3 * stride - 1]) + + 5 * (src[12 * stride - 1] - src[2 * stride - 1]) + + 6 * (src[13 * stride - 1] - src[stride - 1]) + + 7 * (src[14 * stride - 1] - src[-1]) + + 8 * (src[15 * stride - 1] - src[-1 * stride - 1]); + + res0 *= 5; + res1 *= 5; + res0 = (res0 + 32) >> 6; + res1 = (res1 + 32) >> 6; + + res3 = 7 * (res0 + res1); + res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1); + res2 -= res3; + + vec8 = __msa_fill_w(res0); + vec4 = __msa_fill_w(res2); + vec5 = __msa_fill_w(res1); + vec6 = vec8 * 4; + vec7 = vec8 * int_multiplier; + + for (lpcnt = 16; lpcnt--;) { + vec0 = vec7; + vec0 += vec4; + vec1 = vec0 + vec6; + vec2 = vec1 + vec6; + vec3 = vec2 + vec6; + + SRA_4V(vec0, vec1, vec2, vec3, 5); + PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10); + CLIP_SH2_0_255(vec9, vec10); + PCKEV_ST_SB(vec9, vec10, src); + src += stride; + + vec4 += vec5; + } +} + +static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t src0, src1, src3, src2 = 0; + uint32_t out0, out1, out2, out3; + v16u8 src_top; + v8u16 add; + v4u32 sum; + + src_top = LD_UB(src - stride); + add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top); + sum = __msa_hadd_u_w(add, add); + src0 = __msa_copy_u_w((v4i32) sum, 0); + src1 = __msa_copy_u_w((v4i32) sum, 1); + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src0 += src[lp_cnt * stride - 1]; + src2 += src[(4 + lp_cnt) * stride - 1]; + } + + src0 = (src0 + 4) >> 3; + src3 = (src1 + src2 + 4) >> 3; + src1 = (src1 + 2) >> 2; + src2 = (src2 + 2) >> 2; + out0 = src0 * 0x01010101; + out1 = src1 * 0x01010101; + out2 = src2 * 0x01010101; + out3 = src3 * 0x01010101; + + for (lp_cnt = 4; lp_cnt--;) { + SW(out0, src); + SW(out1, (src + 4)); + SW(out2, (src + 4 * stride)); + SW(out3, (src + 4 * stride + 4)); + src += stride; + } +} + +static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; 
+ uint32_t src0 = 0, src1 = 0; + uint64_t out0, out1; + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src0 += src[lp_cnt * stride - 1]; + src1 += src[(4 + lp_cnt) * stride - 1]; + } + + src0 = (src0 + 2) >> 2; + src1 = (src1 + 2) >> 2; + out0 = src0 * 0x0101010101010101; + out1 = src1 * 0x0101010101010101; + + for (lp_cnt = 4; lp_cnt--;) { + SD(out0, src); + SD(out1, (src + 4 * stride)); + src += stride; + } +} + +static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t out0 = 0, out1 = 0; + v16u8 src_top; + v8u16 add; + v4u32 sum; + v4i32 res0, res1; + + src_top = LD_UB(src - stride); + add = __msa_hadd_u_h(src_top, src_top); + sum = __msa_hadd_u_w(add, add); + sum = (v4u32) __msa_srari_w((v4i32) sum, 2); + res0 = (v4i32) __msa_splati_b((v16i8) sum, 0); + res1 = (v4i32) __msa_splati_b((v16i8) sum, 4); + out0 = __msa_copy_u_w(res0, 0); + out1 = __msa_copy_u_w(res1, 0); + + for (lp_cnt = 8; lp_cnt--;) { + SW(out0, src); + SW(out1, src + 4); + src += stride; + } +} + +static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t src0, src1, src2 = 0; + uint32_t out0, out1, out2; + v16u8 src_top; + v8u16 add; + v4u32 sum; + + src_top = LD_UB(src - stride); + add = __msa_hadd_u_h(src_top, src_top); + sum = __msa_hadd_u_w(add, add); + src0 = __msa_copy_u_w((v4i32) sum, 0); + src1 = __msa_copy_u_w((v4i32) sum, 1); + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src2 += src[lp_cnt * stride - 1]; + } + src2 = (src0 + src2 + 4) >> 3; + src0 = (src0 + 2) >> 2; + src1 = (src1 + 2) >> 2; + out0 = src0 * 0x01010101; + out1 = src1 * 0x01010101; + out2 = src2 * 0x01010101; + + for (lp_cnt = 4; lp_cnt--;) { + SW(out2, src); + SW(out1, src + 4); + SW(out0, src + stride * 4); + SW(out1, src + stride * 4 + 4); + src += stride; + } +} + +static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t src0, src1, src2 = 0, src3; + uint32_t out0, out1, out2, out3; + v16u8 src_top; + v8u16 add; + v4u32 sum; + + src_top = LD_UB(src - stride); + add = __msa_hadd_u_h(src_top, src_top); + sum = __msa_hadd_u_w(add, add); + src0 = __msa_copy_u_w((v4i32) sum, 0); + src1 = __msa_copy_u_w((v4i32) sum, 1); + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src2 += src[(4 + lp_cnt) * stride - 1]; + } + + src0 = (src0 + 2) >> 2; + src3 = (src1 + src2 + 4) >> 3; + src1 = (src1 + 2) >> 2; + src2 = (src2 + 2) >> 2; + + out0 = src0 * 0x01010101; + out1 = src1 * 0x01010101; + out2 = src2 * 0x01010101; + out3 = src3 * 0x01010101; + + for (lp_cnt = 4; lp_cnt--;) { + SW(out0, src); + SW(out1, src + 4); + SW(out2, src + stride * 4); + SW(out3, src + stride * 4 + 4); + src += stride; + } +} + +static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t src0 = 0; + uint64_t out0, out1; + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src0 += src[lp_cnt * stride - 1]; + } + + src0 = (src0 + 2) >> 2; + out0 = src0 * 0x0101010101010101; + out1 = 0x8080808080808080; + + for (lp_cnt = 4; lp_cnt--;) { + SD(out0, src); + SD(out1, src + stride * 4); + src += stride; + } +} + +static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride) +{ + uint8_t lp_cnt; + uint32_t src0 = 0; + uint64_t out0, out1; + + for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) { + src0 += src[(4 + lp_cnt) * stride - 1]; + } + + src0 = (src0 + 2) >> 2; + + out0 = 0x8080808080808080; + out1 = src0 * 0x0101010101010101; + + for (lp_cnt = 4; lp_cnt--;) { + SD(out0, src); + SD(out1, 
src + stride * 4); + src += stride; + } +} + +void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_plane_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_dc_4blk_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_hor_dc_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_vert_dc_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, + ptrdiff_t stride) +{ + intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, + ptrdiff_t stride) +{ + intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, + ptrdiff_t stride) +{ + intra_predict_mad_cow_dc_l00_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, + ptrdiff_t stride) +{ + intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride); +} + +void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_plane_16x16_msa(src, stride); +} + +void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *dst = src; + + intra_predict_vert_8x8_msa(src - stride, dst, stride); +} + +void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *dst = src; + + intra_predict_horiz_8x8_msa(src - 1, stride, dst, stride); +} + +void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *src_top = src - stride; + uint8_t *src_left = src - 1; + uint8_t *dst = src; + + intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 1, 1); +} + +void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *dst = src; + + intra_predict_vert_16x16_msa(src - stride, dst, stride); +} + +void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *dst = src; + + intra_predict_horiz_16x16_msa(src - 1, stride, dst, stride); +} + +void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *src_top = src - stride; + uint8_t *src_left = src - 1; + uint8_t *dst = src; + + intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 0, 1); +} + +void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *src_top = src - stride; + uint8_t *src_left = src - 1; + uint8_t *dst = src; + + intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 1, 0); +} + +void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *src_top = src - stride; + uint8_t *src_left = src - 1; + uint8_t *dst = src; + + intra_predict_dc_8x8_msa(src_top, src_left, stride, dst, stride, 0, 0); +} + +void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride) +{ + uint8_t *src_top = src - stride; + uint8_t *src_left = src - 1; + uint8_t *dst = src; + + intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 0, 0); +} + +void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_127dc_8x8_msa(src, stride); +} + +void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_129dc_8x8_msa(src, stride); +} + +void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_127dc_16x16_msa(src, stride); +} + +void 
ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride) +{ + intra_predict_129dc_16x16_msa(src, stride); +} diff --git a/libavcodec/mips/h264qpel_init_mips.c b/libavcodec/mips/h264qpel_init_mips.c new file mode 100644 index 0000000000..92219f8877 --- /dev/null +++ b/libavcodec/mips/h264qpel_init_mips.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264dsp_mips.h" + +#if HAVE_MSA +static av_cold void h264qpel_init_msa(H264QpelContext *c, int bit_depth) +{ + if (8 == bit_depth) { + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_msa; + c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_msa; + c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_msa; + c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_msa; + c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_msa; + c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_msa; + c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_msa; + c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_msa; + c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_msa; + c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_msa; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_msa; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_msa; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_msa; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_msa; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_msa; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_msa; + + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_msa; + c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_msa; + c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_msa; + c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_msa; + c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_msa; + c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_msa; + c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_msa; + c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_msa; + c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_msa; + c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_msa; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_msa; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_msa; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_msa; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_msa; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_msa; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_msa; + + c->put_h264_qpel_pixels_tab[2][1] = 
ff_put_h264_qpel4_mc10_msa; + c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_msa; + c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_msa; + c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_msa; + c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_msa; + c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_msa; + c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_msa; + c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_msa; + c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_msa; + c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_msa; + c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_msa; + c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_msa; + c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_msa; + c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_msa; + c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_msa; + + c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_msa; + c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_msa; + c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_msa; + c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_msa; + c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_msa; + c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_msa; + c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_msa; + c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_msa; + c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_msa; + c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_msa; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_msa; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_msa; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_msa; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_msa; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_msa; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_msa; + + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_msa; + c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_msa; + c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_msa; + c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_msa; + c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_msa; + c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_msa; + c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_msa; + c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_msa; + c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_msa; + c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_msa; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_msa; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_msa; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_msa; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_msa; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_msa; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_msa; + + c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_msa; + c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_msa; + c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_msa; + c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_msa; + c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_msa; + c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_msa; + c->avg_h264_qpel_pixels_tab[2][6] = 
ff_avg_h264_qpel4_mc21_msa; + c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_msa; + c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_msa; + c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_msa; + c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_msa; + c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_msa; + c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_msa; + c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_msa; + c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_msa; + c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_msa; + } +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void h264qpel_init_mmi(H264QpelContext *c, int bit_depth) +{ + if (8 == bit_depth) { + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_mmi; + c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_mmi; + c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_mmi; + c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_mmi; + c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_mmi; + c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_mmi; + c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_mmi; + c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_mmi; + c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_mmi; + c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_mmi; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_mmi; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_mmi; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_mmi; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_mmi; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_mmi; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_mmi; + + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_mmi; + c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_mmi; + c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_mmi; + c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_mmi; + c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_mmi; + c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_mmi; + c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_mmi; + c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_mmi; + c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_mmi; + c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_mmi; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_mmi; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_mmi; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_mmi; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_mmi; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_mmi; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_mmi; + + c->put_h264_qpel_pixels_tab[2][0] = ff_put_h264_qpel4_mc00_mmi; + c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_mmi; + c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_mmi; + c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_mmi; + c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_mmi; + c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_mmi; + c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_mmi; + c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_mmi; + c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_mmi; + c->put_h264_qpel_pixels_tab[2][9] = 
ff_put_h264_qpel4_mc12_mmi; + c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_mmi; + c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_mmi; + c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_mmi; + c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_mmi; + c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_mmi; + c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_mmi; + c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_mmi; + c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_mmi; + c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_mmi; + c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_mmi; + c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_mmi; + c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_mmi; + c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_mmi; + c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_mmi; + c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_mmi; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_mmi; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_mmi; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_mmi; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_mmi; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_mmi; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_mmi; + c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_mmi; + c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_mmi; + c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_mmi; + c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_mmi; + c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_mmi; + c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_mmi; + c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_mmi; + c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_mmi; + c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_mmi; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_mmi; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_mmi; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_mmi; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_mmi; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_mmi; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_mmi; + c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_mmi; + c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_mmi; + c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_mmi; + c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_mmi; + c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_mmi; + c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_mmi; + c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_mmi; + c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_mmi; + c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_mmi; + c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_mmi; + c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_mmi; + c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_mmi; + c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_mmi; + c->avg_h264_qpel_pixels_tab[2][14] = 
ff_avg_h264_qpel4_mc23_mmi; + c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_mmi; + } +} +#endif /* HAVE_MMI */ + +av_cold void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth) +{ +#if HAVE_MSA + h264qpel_init_msa(c, bit_depth); +#endif // #if HAVE_MSA +#if HAVE_MMI + h264qpel_init_mmi(c, bit_depth); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c new file mode 100644 index 0000000000..b4e83e427f --- /dev/null +++ b/libavcodec/mips/h264qpel_mmi.c @@ -0,0 +1,3263 @@ +/* + * Loongson SIMD optimized h264qpel + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264dsp_mips.h" +#include "hpeldsp_mips.h" +#include "libavcodec/bit_depth_template.c" +#include "libavutil/mips/asmdefs.h" + +static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + double ftmp[1]; + uint64_t low32; + + __asm__ volatile ( + "1: \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "gsswlc1 %[ftmp0], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[dst]) \n\t" + "addi %[h], %[h], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride) + : "memory" + ); +} + +static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + double ftmp[1]; + + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" + "addi %[h], %[h], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride) + : "memory" + ); +} + +static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + double ftmp[1]; + uint64_t tmp[1]; + + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" + "ldl %[tmp0], 0x0f(%[src]) \n\t" + "ldr %[tmp0], 0x08(%[src]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" + "sdl %[tmp0], 0x0f(%[dst]) \n\t" + "sdr %[tmp0], 0x08(%[dst]) \n\t" + "addi %[h], %[h], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU 
"%[dst], %[dst], %[dstStride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [h]"+&r"(h) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride) + : "memory" + ); +} + +#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1) +#define op2_put(a, b) a = CLIP(((b) + 512)>>10) +static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[10]; + uint64_t tmp[1]; + uint64_t low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x04 \n\t" + "1: \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], -0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "uld %[low32], 0x03(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp1], %[ftmp6] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_20] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ff_pw_5] \n\t" + "psubsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp9], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_pw_16] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ff_pw_5] \n\t" + "packushb %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "gsswlc1 %[ftmp9], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp9], 0x00(%[dst]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5), + [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[11]; + uint64_t tmp[1]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x05(%[src]) \n\t" + "gsldrc1 %[ftmp1], -0x02(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x06(%[src]) \n\t" + "gsldrc1 %[ftmp2], -0x01(%[src]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" + "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "paddsh %[ftmp3], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp4], %[ftmp8], 
%[ftmp10] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ff_pw_20] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[ff_pw_20] \n\t" + "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp5], %[ftmp8], %[ftmp10] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ff_pw_5] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ff_pw_5] \n\t" + "punpcklbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp1], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp6], %[ftmp8], %[ftmp10] \n\t" + "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "psubsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ff_pw_5] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ff_pw_5] \n\t" + "packushb %[ftmp9], %[ftmp3], %[ftmp4] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[dst]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[dst]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5), + [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[11]; + uint64_t tmp[1]; + uint64_t low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x04 \n\t" + "1: \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], -0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "uld %[low32], 0x03(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp1], %[ftmp6] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_20] \n\t" + "pmullh %[ftmp8], %[ftmp8], 
%[ff_pw_5] \n\t" + "psubsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp9], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_pw_16] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ff_pw_5] \n\t" + "packushb %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "lwc1 %[ftmp10], 0x00(%[dst]) \n\t" + "pavgb %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "gsswlc1 %[ftmp9], 0x03(%[dst]) \n\t" + "gsswrc1 %[ftmp9], 0x00(%[dst]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5), + [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[11]; + uint64_t tmp[1]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x05(%[src]) \n\t" + "gsldrc1 %[ftmp1], -0x02(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x06(%[src]) \n\t" + "gsldrc1 %[ftmp2], -0x01(%[src]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" + "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "paddsh %[ftmp3], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp4], %[ftmp8], %[ftmp10] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ff_pw_20] \n\t" + "pmullh %[ftmp4], %[ftmp4], %[ff_pw_20] \n\t" + "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp5], %[ftmp8], %[ftmp10] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ff_pw_5] \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ff_pw_5] \n\t" + "punpcklbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp1], %[ftmp7], %[ftmp9] \n\t" + "paddsh %[ftmp6], %[ftmp8], %[ftmp10] \n\t" + "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "psubsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "paddsh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ff_pw_5] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ff_pw_5] \n\t" + "packushb %[ftmp9], %[ftmp3], %[ftmp4] \n\t" + "ldc1 %[ftmp10], 0x00(%[dst]) \n\t" + "pavgb %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "sdc1 %[ftmp9], 0x00(%[dst]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez 
%[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5), + [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[12]; + uint64_t tmp[1]; + uint64_t low32; + + src -= 2 * srcStride; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x02 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "dli %[tmp0], 0x05 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "swc1 %[ftmp7], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp7], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp6] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "swc1 %[ftmp7], 0x00(%[dst]) \n\t" + PTR_ADDU 
"%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp5], %[ftmp6] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "swc1 %[ftmp7], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "paddh %[ftmp7], %[ftmp6], %[ftmp1] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + "packushb %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "swc1 %[ftmp7], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + ".set pop \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int w = 2; + int h = 8; + double ftmp[10]; + uint64_t tmp[1]; + uint64_t low32; + + src -= 2 * srcStride; + + while (w--) { + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "dli %[tmp0], 0x02 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "dli %[tmp0], 0x05 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + 
"pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3] , %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], 
%[ftmp5], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "bne %[h], 0x10, 2f \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + PTR_ADDU 
"%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], 
%[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "2: \n\t" + ".set pop \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src), [dst]"+&r"(dst), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); + + src += 4 - (h + 5) * srcStride; + dst += 4 - h * dstStride; + } +} + +static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + double ftmp[10]; + uint64_t tmp[1]; + + src -= 2 * srcStride; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "dli %[tmp0], 0x02 \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x05 \n\t" + "lwc1 %[ftmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "lwc1 %[ftmp1], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "lwc1 %[ftmp2], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "lwc1 %[ftmp3], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "lwc1 %[ftmp4], 0x00(%[src]) \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "lwc1 %[ftmp5], 0x00(%[src]) \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp0], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "lwc1 %[ftmp0], 0x00(%[src]) \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah 
%[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp1], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "lwc1 %[ftmp1], 0x00(%[src]) \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "lwc1 %[ftmp2], 0x00(%[src]) \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp3], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + ".set pop \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src), [dst]"+&r"(dst) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int w = 2; + int h = 8; + double ftmp[10]; + uint64_t tmp[1]; + uint64_t low32; + + src -= 2 * srcStride; + + while (w--) { + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "dli %[tmp0], 0x02 \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x05 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] 
\n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp0], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp1], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp3], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp6], 
%[ftmp6], %[ftmp4] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp4], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp5], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp0], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp1], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + "bne %[h], 0x10, 2f \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], 
%[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp3], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp4], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp5], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp0], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + 
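+        /* avg_ variant: the vertical 6-tap filtering is identical to the put_
+         * routines above; the only difference is that each packed row is
+         * combined with the bytes already at dst via pavgb, i.e. a
+         * rounding-up average (old + new + 1) >> 1, before the swc1 store. */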
"packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp1], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp2], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "lwc1 %[ftmp3], 0x00(%[dst]) \n\t" + "pavgb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "swc1 %[ftmp6], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "2: \n\t" + ".set pop \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src), [dst]"+&r"(dst), + [h]"+&r"(h), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); + + src += 4 - (h + 5) * srcStride; + dst += 4 - h * dstStride; + } +} + +static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + INIT_CLIP + int i; + int16_t _tmp[36]; + int16_t *tmp = _tmp; + double ftmp[10]; + uint64_t tmp0; + uint64_t low32; + + src -= 2*srcStride; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x09 \n\t" + "1: \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], -0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "uld %[low32], 0x03(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + 
"punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp1], %[ftmp6] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_20] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ff_pw_5] \n\t" + "psubsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp9], %[ftmp7], %[ftmp9] \n\t" + "sdc1 %[ftmp9], 0x00(%[tmp]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[tmp], %[tmp], %[tmpStride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp0), + [tmp]"+&r"(tmp), [src]"+&r"(src), + [low32]"=&r"(low32) + : [tmpStride]"r"(8), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5) + : "memory" + ); + + tmp -= 28; + + for (i=0; i<4; i++) { + const int16_t tmpB= tmp[-8]; + const int16_t tmpA= tmp[-4]; + const int16_t tmp0= tmp[ 0]; + const int16_t tmp1= tmp[ 4]; + const int16_t tmp2= tmp[ 8]; + const int16_t tmp3= tmp[12]; + const int16_t tmp4= tmp[16]; + const int16_t tmp5= tmp[20]; + const int16_t tmp6= tmp[24]; + op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + dst++; + tmp++; + } +} + +static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp, + const uint8_t *src, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size) +{ + int w = (size + 8) >> 2; + double ftmp[11]; + uint64_t tmp0; + uint64_t low32; + + src -= 2 * srcStride + 2; + + while (w--) { + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + PTR_ADDU 
"%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "sdc1 %[ftmp6], 0x00(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "sdc1 %[ftmp6], 0x30(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "sdc1 %[ftmp6], 0x60(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "sdc1 %[ftmp6], 0x90(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "sdc1 %[ftmp6], 0xc0(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "sdc1 %[ftmp6], 0xf0(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "sdc1 %[ftmp6], 0x120(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + 
"paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "sdc1 %[ftmp6], 0x150(%[tmp]) \n\t" + "bne %[size], 0x10, 2f \n\t" + + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "sdc1 %[ftmp6], 0x180(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "sdc1 %[ftmp6], 0x1b0(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "paddh %[ftmp6], %[ftmp0], %[ftmp1] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "sdc1 %[ftmp6], 0x1e0(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "sdc1 %[ftmp6], 0x210(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "sdc1 %[ftmp6], 0x240(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_pw_16] \n\t" + "psubh %[ftmp6], 
%[ftmp6], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "sdc1 %[ftmp6], 0x270(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp4], %[ftmp5] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "sdc1 %[ftmp6], 0x2a0(%[tmp]) \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "paddh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ff_pw_5] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "sdc1 %[ftmp6], 0x2d0(%[tmp]) \n\t" + "2: \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp0), + [src]"+&r"(src), + [low32]"=&r"(low32) + : [tmp]"r"(tmp), [size]"r"(size), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); + + tmp += 4; + src += 4 - (size + 5) * srcStride; + } +} + +static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst, + int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size) +{ + int w = size >> 4; + double ftmp[10]; + uint64_t tmp0; + + do { + int h = size; + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x06 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "1: \n\t" + "ldc1 %[ftmp0], 0x00(%[tmp]) \n\t" + "ldc1 %[ftmp3], 0x08(%[tmp]) \n\t" + "ldc1 %[ftmp6], 0x10(%[tmp]) \n\t" + "gsldlc1 %[ftmp1], 0x09(%[tmp]) \n\t" + "gsldrc1 %[ftmp1], 0x02(%[tmp]) \n\t" + "gsldlc1 %[ftmp4], 0x11(%[tmp]) \n\t" + "gsldrc1 %[ftmp4], 0x0a(%[tmp]) \n\t" + "gsldlc1 %[ftmp5], 0x19(%[tmp]) \n\t" + "gsldrc1 %[ftmp5], 0x12(%[tmp]) \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "gsldlc1 %[ftmp2], 0x0b(%[tmp]) \n\t" + "gsldrc1 %[ftmp2], 0x04(%[tmp]) \n\t" + "gsldlc1 %[ftmp6], 0x0d(%[tmp]) \n\t" + "gsldrc1 %[ftmp6], 0x06(%[tmp]) \n\t" + "gsldlc1 %[ftmp5], 0x13(%[tmp]) \n\t" + "gsldrc1 %[ftmp5], 0x0c(%[tmp]) \n\t" + "gsldlc1 %[ftmp7], 0x15(%[tmp]) \n\t" + "gsldrc1 %[ftmp7], 0x0e(%[tmp]) \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "psubh %[ftmp3], 
%[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddsh %[ftmp3] , %[ftmp3], %[ftmp5] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "addi %[h], %[h], -0x01 \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" + PTR_ADDIU "%[tmp], %[tmp], 0x30 \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp0), + [tmp]"+&r"(tmp), [dst]"+&r"(dst), + [h]"+&r"(h) + : [dstStride]"r"((mips_reg)dstStride) + : "memory" + ); + + tmp += 8 - size * 24; + dst += 8 - size * dstStride; + } while (w--); +} + +static void put_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride, int size) +{ + put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size); + put_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size); +} + +static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride) +{ + put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, + srcStride, 8); +} + +static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride) +{ + put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, + srcStride, 16); +} + +static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src, + const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride) +{ + int h = 8; + double ftmp[9]; + uint64_t tmp[1]; + uint64_t low32; + + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x05 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x01(%[src]) \n\t" + "punpckhbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "psllh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "gsldlc1 %[ftmp3], 0x06(%[src]) \n\t" + "gsldrc1 %[ftmp3], -0x01(%[src]) \n\t" + "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" + "punpckhbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ff_pw_5] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ff_pw_5] \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 
%[low32], %[ftmp3] \n\t" + "uld %[low32], 0x07(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_pw_16] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src2]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src2]) \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + PTR_ADDU "%[src], %[src], %[dstStride] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + PTR_ADDU "%[h], %[h], -0x01 \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + PTR_ADDU "%[src2], %[src2], %[src2Stride] \n\t" + "bgtz %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), + [tmp0]"=&r"(tmp[0]), + [src]"+&r"(src), [dst]"+&r"(dst), + [src2]"+&r"(src2), [h]"+&r"(h), + [low32]"=&r"(low32) + : [src2Stride]"r"((mips_reg)src2Stride), + [dstStride]"r"((mips_reg)dstStride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16, + const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h) +{ + double ftmp[7]; + uint64_t tmp0; + + do { + __asm__ volatile ( + "dli %[tmp0], 0x05 \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src16]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src16]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src16]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src16]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[src16]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[src16]) \n\t" + "gsldlc1 %[ftmp3], 0x3f(%[src16]) \n\t" + "gsldrc1 %[ftmp3], 0x38(%[src16]) \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "ldc1 %[ftmp5], 0x00(%[src8]) \n\t" + "gsldxc1 %[ftmp4], 0x00(%[src8], %[src8Stride]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp2], 0x00(%[dst], %[dstStride]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), + [tmp0]"=&r"(tmp0) + : [src8]"r"(src8), [src16]"r"(src16), + [dst]"r"(dst), + [src8Stride]"r"((mips_reg)src8Stride), + [dstStride]"r"((mips_reg)dstStride) + : "memory" + ); + + src8 += 2 * src8Stride; + src16 += 48; + dst += 2 * dstStride; + } while (h -= 2); +} + +static void put_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src, + const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride) +{ + put_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride); + put_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride, + src2Stride); + + src += 8 * dstStride; + dst += 8 * dstStride; + src2 += 8 * src2Stride; + + put_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride); + put_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 
+ 8, dstStride, + src2Stride); +} + +static void put_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16, + const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h) +{ + put_pixels8_l2_shift5_mmi(dst, src16, src8, dstStride, src8Stride, h); + put_pixels8_l2_shift5_mmi(dst + 8, src16 + 8, src8 + 8, dstStride, + src8Stride, h); +} + +static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + INIT_CLIP + int i; + int16_t _tmp[36]; + int16_t *tmp = _tmp; + double ftmp[10]; + uint64_t tmp0; + uint64_t low32; + + src -= 2*srcStride; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dli %[tmp0], 0x09 \n\t" + "1: \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], -0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[src]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "uld %[low32], 0x01(%[src]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "uld %[low32], 0x03(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddsh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp2], %[ftmp5] \n\t" + "paddsh %[ftmp9], %[ftmp1], %[ftmp6] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ff_pw_20] \n\t" + "pmullh %[ftmp8], %[ftmp8], %[ff_pw_5] \n\t" + "psubsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "paddsh %[ftmp9], %[ftmp7], %[ftmp9] \n\t" + "sdc1 %[ftmp9], 0x00(%[tmp]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[src], %[src], %[srcStride] \n\t" + PTR_ADDU "%[tmp], %[tmp], %[tmpStride] \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp0), + [tmp]"+&r"(tmp), [src]"+&r"(src), + [low32]"=&r"(low32) + : [tmpStride]"r"(8), + [srcStride]"r"((mips_reg)srcStride), + [ff_pw_20]"f"(ff_pw_20), [ff_pw_5]"f"(ff_pw_5) + : "memory" + ); + + tmp -= 28; + + for (i=0; i<4; i++) { + const int16_t tmpB= tmp[-8]; + const int16_t tmpA= tmp[-4]; + const int16_t tmp0= tmp[ 0]; + const int16_t tmp1= tmp[ 4]; + const int16_t tmp2= tmp[ 8]; + const int16_t tmp3= tmp[12]; + const int16_t tmp4= tmp[16]; + const int16_t tmp5= tmp[20]; + const int16_t tmp6= tmp[24]; + op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + dst++; + tmp++; + } +} + +static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst, + int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size) +{ + int w = size >> 4; + double ftmp[11]; + uint64_t tmp0; + + do { + int h = size; + __asm__ volatile ( + "dli %[tmp0], 0x02 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x06 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "1: \n\t" + "ldc1 %[ftmp0], 0x00(%[tmp]) \n\t" + "ldc1 %[ftmp3], 0x08(%[tmp]) \n\t" + "gsldlc1 %[ftmp1], 0x09(%[tmp]) \n\t" + "gsldrc1 %[ftmp1], 0x02(%[tmp]) \n\t" + "gsldlc1 
%[ftmp4], 0x11(%[tmp]) \n\t" + "gsldrc1 %[ftmp4], 0x0a(%[tmp]) \n\t" + "ldc1 %[ftmp7], 0x10(%[tmp]) \n\t" + "gsldlc1 %[ftmp8], 0x19(%[tmp]) \n\t" + "gsldrc1 %[ftmp8], 0x12(%[tmp]) \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "gsldlc1 %[ftmp2], 0x0b(%[tmp]) \n\t" + "gsldrc1 %[ftmp2], 0x04(%[tmp]) \n\t" + "gsldlc1 %[ftmp5], 0x13(%[tmp]) \n\t" + "gsldrc1 %[ftmp5], 0x0c(%[tmp]) \n\t" + "gsldlc1 %[ftmp7], 0x0d(%[tmp]) \n\t" + "gsldrc1 %[ftmp7], 0x06(%[tmp]) \n\t" + "gsldlc1 %[ftmp8], 0x15(%[tmp]) \n\t" + "gsldrc1 %[ftmp8], 0x0e(%[tmp]) \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "ldc1 %[ftmp6], 0x00(%[dst]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "addi %[h], %[h], -0x01 \n\t" + PTR_ADDI "%[tmp], %[tmp], 0x30 \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp0), + [tmp]"+&r"(tmp), [dst]"+&r"(dst), + [h]"+&r"(h) + : [dstStride]"r"((mips_reg)dstStride) + : "memory" + ); + + tmp += 8 - size * 24; + dst += 8 - size * dstStride; + } while (w--); +} + +static void avg_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride, int size) +{ + put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size); + avg_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size); +} + +static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride) +{ + avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, + srcStride, 8); +} + +static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, + const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride, + ptrdiff_t srcStride) +{ + avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, + srcStride, 16); +} + +static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src, + const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride) +{ + double ftmp[10]; + uint64_t tmp[2]; + uint64_t low32; + + __asm__ volatile ( + "dli %[tmp1], 0x02 \n\t" + "ori %[tmp0], $0, 0x8 \n\t" + "mtc1 %[tmp1], %[ftmp7] \n\t" + "dli %[tmp1], 0x05 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp1], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 
0x08(%[src]) \n\t" + "gsldrc1 %[ftmp2], 0x01(%[src]) \n\t" + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psllh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "gsldlc1 %[ftmp2], 0x06(%[src]) \n\t" + "gsldrc1 %[ftmp2], -0x01(%[src]) \n\t" + "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" + "punpckhbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ff_pw_5] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5] \n\t" + "uld %[low32], -0x02(%[src]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x07(%[src]) \n\t" + "mtc1 %[low32], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_pw_16] \n\t" + "paddh %[ftmp5], %[ftmp5], %[ff_pw_16] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src2]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src2]) \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "ldc1 %[ftmp9], 0x00(%[dst]) \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + PTR_ADDU "%[src], %[src], %[dstStride] \n\t" + "sdc1 %[ftmp1], 0x00(%[dst]) \n\t" + "daddi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[dst], %[dst], %[dstStride] \n\t" + PTR_ADDU "%[src2], %[src2], %[src2Stride] \n\t" + "bgtz %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [dst]"+&r"(dst), [src]"+&r"(src), + [src2]"+&r"(src2), + [low32]"=&r"(low32) + : [dstStride]"r"((mips_reg)dstStride), + [src2Stride]"r"((mips_reg)src2Stride), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "memory" + ); +} + +static void avg_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src, + const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride) +{ + avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride); + avg_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride, + src2Stride); + + src += 8 * dstStride; + dst += 8 * dstStride; + src2 += 8 * src2Stride; + + avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride); + avg_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride, + src2Stride); +} + +static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16, + const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b) +{ + double ftmp[8]; + uint64_t tmp0; + + do { + __asm__ volatile ( + "dli %[tmp0], 0x05 \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src16]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src16]) \n\t" + "mtc1 
%[tmp0], %[ftmp6] \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src16]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src16]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[src16]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[src16]) \n\t" + "gsldlc1 %[ftmp3], 0x3f(%[src16]) \n\t" + "gsldrc1 %[ftmp3], 0x38(%[src16]) \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "ldc1 %[ftmp4], 0x00(%[src8]) \n\t" + "gsldxc1 %[ftmp5], 0x00(%[src8], %[src8Stride]) \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + "ldc1 %[ftmp7], 0x00(%[dst]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gsldxc1 %[ftmp7], 0x00(%[dst], %[dstStride]) \n\t" + "pavgb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "gssdxc1 %[ftmp2], 0x00(%[dst], %[dstStride]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp0) + : [src8]"r"(src8), [src16]"r"(src16), + [dst]"r"(dst), + [src8Stride]"r"((mips_reg)src8Stride), + [dstStride]"r"((mips_reg)dstStride) + : "memory" + ); + + src8 += 2 * src8Stride; + src16 += 48; + dst += 2 * dstStride; + } while (b -= 2); +} + +static void avg_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16, + const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b) +{ + avg_pixels8_l2_shift5_mmi(dst, src16, src8, dstStride, src8Stride, b); + avg_pixels8_l2_shift5_mmi(dst + 8, src16 + 8, src8 + 8, dstStride, + src8Stride, b); +} + +//DEF_H264_MC_MMI(put_, 4) +void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels4_8_mmi(dst, src, stride, 4); +} + +void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, src, half, stride, stride, 4, 4); +} + +void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, src+1, half, stride, stride, 4, 4); +} + +void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, full_mid, half, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4); +} + +void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, full_mid+4, half, stride, 4, 4, 4); +} + +void 
ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_put_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +//DEF_H264_MC_MMI(avg_, 4) +void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels4_8_mmi(dst, src, stride, 4); +} + +void 
ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, src, half, stride, stride, 4, 4); +} + +void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, src+1, half, stride, stride, 4, 4); +} + +void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, full_mid, half, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + avg_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4); +} + +void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, full_mid+4, half, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t 
halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + ff_avg_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +//DEF_H264_MC_MMI(put_, 8) +void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels8_8_mmi(dst, src, stride, 8); +} + +void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + ff_put_pixels8_l2_8_mmi(dst, src, half, stride, stride, 8, 8); +} + +void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + ff_put_pixels8_l2_8_mmi(dst, src+1, half, stride, stride, 8, 8); +} + +void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, full_mid, half, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8); +} + +void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, full_mid+8, half, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, halfH, 
halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint16_t __attribute__ ((aligned(8))) temp[192]; + + put_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride); +} + +void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + put_h264_qpel8_h_lowpass_l2_mmi(dst, src, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + put_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + put_pixels8_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 8, 8); +} + +void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + put_pixels8_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 8, 8); +} + +//DEF_H264_MC_MMI(avg_, 8) +void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels8_8_mmi(dst, src, stride, 8); +} + +void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + ff_avg_pixels8_l2_8_mmi(dst, src, half, stride, stride, 8, 8); +} + +void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride); 
+} + +void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + ff_avg_pixels8_l2_8_mmi(dst, src+1, half, stride, stride, 8, 8); +} + +void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, full_mid, half, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + avg_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8); +} + +void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, full_mid+8, half, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint16_t __attribute__ ((aligned(8))) temp[192]; + + avg_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride); +} + +void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, halfHV, stride, 8); +} + +void 
ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + avg_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 8); +} + +void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + avg_pixels8_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 8, 8); +} + +void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[448]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 64); + + put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride); + avg_pixels8_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 8, 8); +} + +//DEF_H264_MC_MMI(put_, 16) +void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels16_8_mmi(dst, src, stride, 16); +} + +void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + ff_put_pixels16_l2_8_mmi(dst, src, half, stride, stride, 16, 16); +} + +void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + ff_put_pixels16_l2_8_mmi(dst, src+1, half, stride, stride, 16, 16); +} + +void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, full_mid, half, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16); +} + +void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, full_mid+16, half, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + 
put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint16_t __attribute__ ((aligned(8))) temp[384]; + + put_h264_qpel16_hv_lowpass_mmi(dst, temp, src, stride, 16, stride); +} + +void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[1024]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 256); + + put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride); + put_h264_qpel16_h_lowpass_l2_mmi(dst, src, halfHV, stride, 16); +} + +void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[1024]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 256); + + put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride); + put_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 16); +} + +void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[1024]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 256); + + put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride); + put_pixels16_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 16, 16); +} + +void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[1024]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 256); + + put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride); + put_pixels16_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 16, 16); +} + +//DEF_H264_MC_MMI(avg_, 16) +void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels16_8_mmi(dst, src, stride, 16); +} + +void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + ff_avg_pixels16_l2_8_mmi(dst, src, half, stride, stride, 16, 16); +} + +void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + 
put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + ff_avg_pixels16_l2_8_mmi(dst, src+1, half, stride, stride, 16, 16); +} + +void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, full_mid, half, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + avg_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16); +} + +void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, full_mid+16, half, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint16_t __attribute__ ((aligned(8))) temp[384]; + + avg_h264_qpel16_hv_lowpass_mmi(dst, temp, src, stride, 16, stride); +} + +void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t __attribute__ ((aligned(8))) temp[1024]; + uint8_t *const halfHV = temp; + int16_t *const halfV = (int16_t *) (temp + 256); + + put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride); + avg_h264_qpel16_h_lowpass_l2_mmi(dst, src, halfHV, stride, 16); +} + +void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const 
uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 16, 16);
+}
+
+#undef op2_avg
+#undef op2_put
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
new file mode 100644
index 0000000000..c38f1f7a42
--- /dev/null
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -0,0 +1,3600 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h264dsp_mips.h" + +#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \ +( { \ + v4i32 tmp0_m, tmp1_m; \ + v8i16 out0_m, out1_m, out2_m, out3_m; \ + v8i16 minus5h_m = __msa_ldi_h(-5); \ + v8i16 plus20h_m = __msa_ldi_h(20); \ + \ + ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \ + \ + tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \ + tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \ + \ + ILVRL_H2_SH(in1, in4, out0_m, out1_m); \ + DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \ + ILVRL_H2_SH(in2, in3, out2_m, out3_m); \ + DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \ + \ + SRARI_W2_SW(tmp0_m, tmp1_m, 10); \ + SAT_SW2_SW(tmp0_m, tmp1_m, 7); \ + out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ + \ + out0_m; \ +} ) + +#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2) \ +( { \ + v8i16 out0_m, out1_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 minus5b = __msa_ldi_b(-5); \ + v16i8 plus20b = __msa_ldi_b(20); \ + \ + tmp0_m = __msa_vshf_b((v16i8) mask0, in, in); \ + out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \ + \ + tmp0_m = __msa_vshf_b((v16i8) mask1, in, in); \ + out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \ + \ + tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in); \ + out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m); \ + \ + out1_m; \ +} ) + +static const uint8_t luma_mask_arr[16 * 8] = { + /* 8 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, + 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + + /* 4 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, + 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, + 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, + + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26 +}; + +#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \ + out1, out2) \ +{ \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 minus5b_m = __msa_ldi_b(-5); \ + v16i8 plus20b_m = __msa_ldi_b(20); \ + \ + ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \ + HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \ + ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \ + DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \ + ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \ + DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \ +} + +#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \ +( { \ + v8i16 tmp1_m; \ + v16i8 tmp0_m, tmp2_m; \ + v16i8 minus5b_m = __msa_ldi_b(-5); \ + v16i8 plus20b_m = __msa_ldi_b(20); \ + \ + tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \ + tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \ + \ + ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \ + DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \ + \ + tmp1_m; \ +} ) + +#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \ +( { \ + v4i32 tmp1_m; \ + v8i16 tmp2_m, tmp3_m; \ + v8i16 minus5h_m = __msa_ldi_h(-5); \ + v8i16 plus20h_m = __msa_ldi_h(20); \ + \ + tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \ + tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \ + \ + 
ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \ + DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \ + \ + tmp1_m = __msa_srari_w(tmp1_m, 10); \ + tmp1_m = __msa_sat_s_w(tmp1_m, 7); \ + \ + tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \ + \ + tmp2_m; \ +} ) + +#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \ + mask0, mask1, mask2) \ +( { \ + v8i16 hz_out_m; \ + v16i8 vec0_m, vec1_m, vec2_m; \ + v16i8 minus5b_m = __msa_ldi_b(-5); \ + v16i8 plus20b_m = __msa_ldi_b(20); \ + \ + vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \ + hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \ + DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \ + \ + hz_out_m; \ +} ) + +static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 res0, res1; + v16u8 out; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); + HADD_SB2_SH(vec0, vec1, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); + DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); + DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); + SRARI_H2_SH(res0, res1, 5); + SAT_SH2_SH(res0, res1, 7); + out = PCKEV_XORI128_UB(res0, res1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 res0, res1, res2, res3; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + v16u8 out0, out1; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); + HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, + res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); + DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + out0 = PCKEV_XORI128_UB(res0, res1); + out1 = PCKEV_XORI128_UB(res2, res3); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t 
src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, 8, src0, src1); + src += src_stride; + LD_SB2(src, 8, src2, src3); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + + LD_SB2(src, 8, src4, src5); + src += src_stride; + LD_SB2(src, 8, src6, src7); + src += src_stride; + + XORI_B4_128_SB(src4, src5, src6, src7); + VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10); + VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res4, res5, res6, res7); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res4, res5, res6, res7); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SRARI_H4_SH(res4, res5, res6, res7, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + SAT_SH4_SH(res4, res5, res6, res7, 7); + PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, + vec0, vec1, vec2, vec3); + XORI_B4_128_SB(vec0, vec1, vec2, vec3); + + ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t hor_offset) +{ + uint8_t slide; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 res0, res1; + v16i8 res, mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + slide = 2 + hor_offset; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); + HADD_SB2_SH(vec0, vec1, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); + DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); + DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); + SRARI_H2_SH(res0, res1, 5); + SAT_SH2_SH(res0, res1, 7); + + res = 
__msa_pckev_b((v16i8) res1, (v16i8) res0); + src0 = __msa_sld_b(src0, src0, slide); + src1 = __msa_sld_b(src1, src1, slide); + src2 = __msa_sld_b(src2, src2, slide); + src3 = __msa_sld_b(src3, src3, slide); + src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); + src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); + src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); + res = __msa_aver_s_b(res, src0); + res = (v16i8) __msa_xori_b((v16u8) res, 128); + + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t hor_offset) +{ + uint8_t slide; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v16i8 tmp0, tmp1; + v8i16 res0, res1, res2, res3; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + slide = 2 + hor_offset; + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); + HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, + res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); + DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + + src0 = __msa_sld_b(src0, src0, slide); + src1 = __msa_sld_b(src1, src1, slide); + src2 = __msa_sld_b(src2, src2, slide); + src3 = __msa_sld_b(src3, src3, slide); + + SRARI_H4_SH(res0, res1, res2, res3, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); + PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); + + tmp0 = __msa_aver_s_b(tmp0, src0); + tmp1 = __msa_aver_s_b(tmp1, src1); + + XORI_B2_128_SB(tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t hor_offset) +{ + uint32_t loop_cnt; + v16i8 dst0, dst1; + v16i8 src0, src1, src2, src3; + v16i8 mask0, mask1, mask2, vshf; + v8i16 res0, res1, res2, res3; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + if (hor_offset) { + vshf = LD_SB(&luma_mask_arr[16 + 96]); + } else { + vshf = LD_SB(&luma_mask_arr[96]); + } + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, 8, src0, src1); + src += src_stride; + LD_SB2(src, 8, src2, src3); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); 
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1); + + dst0 = __msa_aver_s_b(dst0, src0); + dst1 = __msa_aver_s_b(dst1, src2); + + XORI_B2_128_SB(dst0, dst1); + + ST_SB2(dst0, dst1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776; + v16i8 filt0, filt1, filt2; + v8i16 out10, out32; + v16u8 out; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, 5); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + + dst += (4 * dst_stride); + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + +static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 out0_r, out1_r, out2_r, out3_r; + v16i8 filt0, filt1, filt2; + v16u8 out0, out1; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, 
src109_r); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + out0 = PCKEV_XORI128_UB(out0_r, out1_r); + out1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16u8 res0, res1, res2, res3; + v16i8 filt0, filt1, filt2; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, res0, res1, res2, res3); + XORI_B4_128_UB(res0, res1, res2, res3); + + ST_UB4(res0, res1, res2, res3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t ver_offset) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 
0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776; + v8i16 out10, out32; + v16i8 filt0, filt1, filt2; + v16u8 out; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, 5); + SAT_SH2_SH(out10, out32, 7); + + out = PCKEV_XORI128_UB(out10, out32); + + if (ver_offset) { + src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); + src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); + } else { + src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); + src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); + } + + src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); + out = __msa_aver_u_b(out, (v16u8) src32_r); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + src2110 = src6554; + src4332 = src8776; + src2 = src6; + src3 = src7; + src4 = src8; + } +} + +static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t ver_offset) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 out0_r, out1_r, out2_r, out3_r; + v16i8 res0, res1; + v16i8 filt0, filt1, filt2; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1); + + if (ver_offset) { + PCKEV_D2_SB(src4, src3, 
src8, src7, src10_r, src32_r); + } else { + PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r); + } + + res0 = __msa_aver_s_b(res0, (v16i8) src10_r); + res1 = __msa_aver_s_b(res1, (v16i8) src32_r); + + XORI_B2_128_SB(res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + + dst += (4 * dst_stride); + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src2 = src8; + src3 = src9; + src4 = src10; + } +} + +static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t ver_offset) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16u8 res0, res1, res2, res3; + v16i8 filt0, filt1, filt2; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, res0, res1, res2, res3); + + if (ver_offset) { + res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3); + res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4); + res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5); + res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6); + } else { + res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2); + res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3); + res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4); + res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5); + } + + XORI_B4_128_UB(res0, res1, res2, res3); + ST_UB4(res0, res1, res2, res3, dst, dst_stride); + + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = 
src87_l; + src2 = src6; + src3 = src7; + src4 = src8; + } +} + +static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, mask2); + hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, mask2); + + PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); + + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, + mask2); + hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, + mask2); + + PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8); + + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + + PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1); + XORI_B2_128_SB(src0, src1); + + ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride); + + dst += (4 * dst_stride); + + hz_out0 = hz_out4; + hz_out1 = hz_out5; + hz_out2 = hz_out6; + hz_out3 = hz_out7; + hz_out4 = hz_out8; + } +} + +static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3; + v16u8 out0, out1; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + src += (5 * src_stride); + + hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + + hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, 
hz_out4, + hz_out5, hz_out6, hz_out7); + dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + out0 = PCKEV_XORI128_UB(dst0, dst1); + out1 = PCKEV_XORI128_UB(dst2, dst3); + ST8x4_UB(out0, out1, dst, dst_stride); + + dst += (4 * dst_stride); + hz_out3 = hz_out7; + hz_out1 = hz_out5; + hz_out5 = hz_out4; + hz_out4 = hz_out8; + hz_out2 = hz_out6; + hz_out0 = hz_out5; + } +} + +static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height); + src += 8; + dst += 8; + } +} + +static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t horiz_offset) +{ + uint32_t row; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 vt_res0, vt_res1, vt_res2, vt_res3; + v4i32 hz_res0, hz_res1; + v8i16 dst0, dst1; + v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5; + v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; + v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; + v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; + v8i16 minus5h = __msa_ldi_h(-5); + v8i16 plus20h = __msa_ldi_h(20); + v8i16 zeros = { 0 }; + v16u8 out; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + for (row = (height >> 1); row--;) { + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, + vt_res0, vt_res1); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, + vt_res2, vt_res3); + VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, + mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2); + VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, + mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5); + hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); + DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); + hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); + DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); + + SRARI_W2_SW(hz_res0, hz_res1, 10); + SAT_SW2_SW(hz_res0, hz_res1, 7); + + dst0 = __msa_srari_h(shf_vec2, 5); + dst1 = __msa_srari_h(shf_vec5, 5); + + SAT_SH2_SH(dst0, dst1, 7); + + if (horiz_offset) { + dst0 = __msa_ilvod_h(zeros, dst0); + dst1 = __msa_ilvod_h(zeros, dst1); + } else { + ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1); + } + + hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); + hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); + dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0); + + out = PCKEV_XORI128_UB(dst0, dst0); + ST4x2_UB(out, dst, dst_stride); + + dst += (2 * dst_stride); + + src0 = src2; + src1 = src3; + src2 = src4; + src3 = src5; + src4 = src6; + } +} + +static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t horiz_offset) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height, + horiz_offset); + + src += 4; + dst += 4; + } +} + +static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t horiz_offset) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 4; 
multiple8_cnt--;) { + avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height, + horiz_offset); + + src += 4; + dst += 4; + } +} + +static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t ver_offset) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, mask2); + hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, mask2); + + PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); + + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, + mask2); + hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, + mask2); + + PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8); + + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + + if (ver_offset) { + dst1 = __msa_srari_h(hz_out3, 5); + dst3 = __msa_srari_h(hz_out4, 5); + dst5 = __msa_srari_h(hz_out5, 5); + dst7 = __msa_srari_h(hz_out6, 5); + } else { + dst1 = __msa_srari_h(hz_out2, 5); + dst3 = __msa_srari_h(hz_out3, 5); + dst5 = __msa_srari_h(hz_out4, 5); + dst7 = __msa_srari_h(hz_out5, 5); + } + + SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); + + dst0 = __msa_aver_s_h(dst0, dst1); + dst1 = __msa_aver_s_h(dst2, dst3); + dst2 = __msa_aver_s_h(dst4, dst5); + dst3 = __msa_aver_s_h(dst6, dst7); + + PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1); + XORI_B2_128_SB(src0, src1); + + ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride); + + dst += (4 * dst_stride); + hz_out0 = hz_out4; + hz_out1 = hz_out5; + hz_out2 = hz_out6; + hz_out3 = hz_out7; + hz_out4 = hz_out8; + } +} + +static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t ver_offset) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 out; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + src += (5 * src_stride); + + hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, 
mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + + hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + + if (ver_offset) { + dst1 = __msa_srari_h(hz_out3, 5); + dst3 = __msa_srari_h(hz_out4, 5); + dst5 = __msa_srari_h(hz_out5, 5); + dst7 = __msa_srari_h(hz_out6, 5); + } else { + dst1 = __msa_srari_h(hz_out2, 5); + dst3 = __msa_srari_h(hz_out3, 5); + dst5 = __msa_srari_h(hz_out4, 5); + dst7 = __msa_srari_h(hz_out5, 5); + } + + SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); + + dst0 = __msa_aver_s_h(dst0, dst1); + dst1 = __msa_aver_s_h(dst2, dst3); + dst2 = __msa_aver_s_h(dst4, dst5); + dst3 = __msa_aver_s_h(dst6, dst7); + + out = PCKEV_XORI128_UB(dst0, dst0); + ST8x1_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(dst1, dst1); + ST8x1_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(dst2, dst2); + ST8x1_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(dst3, dst3); + ST8x1_UB(out, dst); + dst += dst_stride; + + hz_out0 = hz_out4; + hz_out1 = hz_out5; + hz_out2 = hz_out6; + hz_out3 = hz_out7; + hz_out4 = hz_out8; + } +} + +static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, uint8_t vert_offset) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_midv_qrt_8w_msa(src, src_stride, dst, dst_stride, height, + vert_offset); + + src += 8; + dst += 8; + } +} + +static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src_hz0, src_hz1, src_hz2, src_hz3; + v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4; + v16i8 src_vt5, src_vt6, src_vt7, src_vt8; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, vert_out0, vert_out1; + v8i16 out0, out1; + v16u8 out; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + + LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); + src_y += (5 * src_stride); + + src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); + src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); + src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); + src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); + + XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); + src_x += (4 * src_stride); + + XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); + + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, + src_hz1, mask0, + mask1, mask2); + hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, + src_hz3, mask0, + mask1, mask2); + + SRARI_H2_SH(hz_out0, hz_out1, 5); + SAT_SH2_SH(hz_out0, hz_out1, 7); + + 
LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); + src_y += (4 * src_stride); + + src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); + src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); + src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); + src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); + + XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); + + /* filter calc */ + vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, + src_vt2, src_vt3, + src_vt4, src_vt5); + vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, + src_vt4, src_vt5, + src_vt6, src_vt7); + + SRARI_H2_SH(vert_out0, vert_out1, 5); + SAT_SH2_SH(vert_out0, vert_out1, 7); + + out0 = __msa_srari_h((hz_out0 + vert_out0), 1); + out1 = __msa_srari_h((hz_out1 + vert_out1), 1); + + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src_vt3 = src_vt7; + src_vt1 = src_vt5; + src_vt0 = src_vt4; + src_vt4 = src_vt8; + src_vt2 = src_vt6; + } +} + +static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src_hz0, src_hz1, src_hz2, src_hz3; + v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4; + v16i8 src_vt5, src_vt6, src_vt7, src_vt8; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 vert_out0, vert_out1, vert_out2, vert_out3; + v8i16 out0, out1, out2, out3; + v16u8 tmp0, tmp1; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); + src_y += (5 * src_stride); + + src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1); + src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2); + src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3); + src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4); + + XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); + XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); + src_x += (4 * src_stride); + + hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2); + + SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); + SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); + + LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); + src_y += (4 * src_stride); + + src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5); + src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6); + src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7); + src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8); + + XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); + + /* filter calc */ + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3, + src_vt4, src_vt5, vert_out0, vert_out1); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5, + src_vt6, src_vt7, vert_out2, vert_out3); + + SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5); + SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7); + + out0 = __msa_srari_h((hz_out0 + vert_out0), 1); + out1 = 
__msa_srari_h((hz_out1 + vert_out1), 1); + out2 = __msa_srari_h((hz_out2 + vert_out2), 1); + out3 = __msa_srari_h((hz_out3 + vert_out3), 1); + + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + + dst += (4 * dst_stride); + src_vt3 = src_vt7; + src_vt1 = src_vt5; + src_vt5 = src_vt4; + src_vt4 = src_vt8; + src_vt2 = src_vt6; + src_vt0 = src_vt5; + } +} + +static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride, + height); + + src_x += 8; + src_y += 8; + dst += 8; + } +} + +static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16i8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3, res; + v8i16 res0, res1; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB4(src, src_stride, src0, src1, src2, src3); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); + HADD_SB2_SH(vec0, vec1, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); + DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); + DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); + SRARI_H2_SH(res0, res1, 5); + SAT_SH2_SH(res0, res1, 7); + res = PCKEV_XORI128_UB(res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + res = __msa_aver_u_b(res, dst0); + + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); +} + +static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8i16 res0, res1, res2, res3; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); + HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, + res0, res1, res2, res3); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); + DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3, 
+ dst, dst_stride); + + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0, mask1, mask2; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + for (loop_cnt = 4; loop_cnt--;) { + LD_SB2(src, 8, src0, src1); + src += src_stride; + LD_SB2(src, 8, src2, src3); + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + LD_SB2(src, 8, src4, src5); + src += src_stride; + LD_SB2(src, 8, src6, src7); + src += src_stride; + XORI_B4_128_SB(src4, src5, src6, src7); + VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10); + VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res4, res5, res6, res7); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res4, res5, res6, res7); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SRARI_H4_SH(res4, res5, res6, res7, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + SAT_SH4_SH(res4, res5, res6, res7, 7); + PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, + vec0, vec1, vec2, vec3); + XORI_B4_128_SB(vec0, vec1, vec2, vec3); + AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t hor_offset) +{ + uint8_t slide; + v16i8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 out0, out1; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + v16u8 res0, res1; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + + if (hor_offset) { + slide = 3; + } else { + slide = 2; + } + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); + HADD_SB2_SH(vec0, vec1, out0, out1); + 
VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); + DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1); + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); + DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1); + SRARI_H2_SH(out0, out1, 5); + SAT_SH2_SH(out0, out1, 7); + + PCKEV_B2_UB(out0, out0, out1, out1, res0, res1); + + src0 = __msa_sld_b(src0, src0, slide); + src1 = __msa_sld_b(src1, src1, slide); + src2 = __msa_sld_b(src2, src2, slide); + src3 = __msa_sld_b(src3, src3, slide); + src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); + src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); + res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0); + res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1); + + XORI_B2_128_UB(res0, res1); + + dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1); + dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3); + + AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); + + ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); +} + +static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t hor_offset) +{ + uint8_t slide; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v16i8 mask0, mask1, mask2; + v16u8 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 out0, out1, out2, out3; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + v16i8 res0, res1, res2, res3; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + if (hor_offset) { + slide = 3; + } else { + slide = 2; + } + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); + HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, + out0, out1, out2, out3); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); + DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, + plus20b, out0, out1, out2, out3); + + src0 = __msa_sld_b(src0, src0, slide); + src1 = __msa_sld_b(src1, src1, slide); + src2 = __msa_sld_b(src2, src2, slide); + src3 = __msa_sld_b(src3, src3, slide); + + SRARI_H4_SH(out0, out1, out2, out3, 5); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3, + res0, res1, res2, res3); + + res0 = __msa_aver_s_b(res0, src0); + res1 = __msa_aver_s_b(res1, src1); + res2 = __msa_aver_s_b(res2, src2); + res3 = __msa_aver_s_b(res3, src3); + + XORI_B4_128_SB(res0, res1, res2, res3); + AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3, + dst, dst_stride); + + dst += (4 * dst_stride); + } +} + +static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t hor_offset) +{ + uint32_t loop_cnt; + v16i8 out0, out1; + v16i8 src0, src1, src2, src3; + v16i8 mask0, mask1, mask2, vshf; + v16u8 dst0, dst1; + v8i16 res0, res1, res2, res3; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, 
vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b(-5); + v16i8 plus20b = __msa_ldi_b(20); + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + if (hor_offset) { + vshf = LD_SB(&luma_mask_arr[16 + 96]); + } else { + vshf = LD_SB(&luma_mask_arr[96]); + } + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 8, src0, src1); + src += src_stride; + LD_SB2(src, 8, src2, src3); + src += src_stride; + + LD_UB2(dst, dst_stride, dst0, dst1); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); + HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); + DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3); + DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3); + VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2); + SRARI_H4_SH(res0, res1, res2, res3, 5); + SAT_SH4_SH(res0, res1, res2, res3, 7); + PCKEV_B2_SB(res1, res0, res3, res2, out0, out1); + + out0 = __msa_aver_s_b(out0, src0); + out1 = __msa_aver_s_b(out1, src2); + + XORI_B2_128_SB(out0, out1); + AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); + ST_UB2(dst0, dst1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776; + v8i16 out10, out32; + v16i8 filt0, filt1, filt2; + v16u8 res; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + LD_SB4(src, src_stride, src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, 5); + SAT_SH2_SH(out10, out32, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + res = PCKEV_XORI128_UB(out10, out32); + + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + dst0 = __msa_aver_u_b(res, dst0); + + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); +} + +static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + 
v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 out0, out1, out2, out3; + v16i8 filt0, filt1, filt2; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0, out1, out2, out3, 5); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16i8 filt0, filt1, filt2; + v16u8 res0, res1, res2, res3; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + + for (loop_cnt = 4; loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, 
filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, res0, res1, res2, res3); + XORI_B4_128_UB(res0, res1, res2, res3); + AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3, + res0, res1, res2, res3); + ST_UB4(res0, res1, res2, res3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t ver_offset) +{ + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776; + v8i16 out10, out32; + v16i8 filt0, filt1, filt2; + v16u8 res; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + LD_SB4(src, src_stride, src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, 5); + SAT_SH2_SH(out10, out32, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + res = PCKEV_XORI128_UB(out10, out32); + + if (ver_offset) { + src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); + src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); + } else { + src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); + src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); + } + + src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); + res = __msa_aver_u_b(res, (v16u8) src32_r); + + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + dst0 = __msa_aver_u_b(res, dst0); + + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); +} + +static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t ver_offset) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 out0_r, out1_r, out2_r, out3_r; + v16i8 res0, res1; + v16u8 vec0, vec1; + v16i8 filt0, filt1, filt2; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) 
__msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1); + + if (ver_offset) { + PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r); + } else { + PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r); + } + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + + vec0 = (v16u8) __msa_aver_s_b(res0, src10_r); + vec1 = (v16u8) __msa_aver_s_b(res1, src32_r); + + XORI_B2_128_UB(vec0, vec1); + AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src2 = src8; + src3 = src9; + src4 = src10; + } +} + +static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + uint8_t ver_offset) +{ + int32_t loop_cnt; + int16_t filt_const0 = 0xfb01; + int16_t filt_const1 = 0x1414; + int16_t filt_const2 = 0x1fb; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16i8 out0, out1, out2, out3; + v16i8 filt0, filt1, filt2; + v16u8 res0, res1, res2, res3; + + filt0 = (v16i8) __msa_fill_h(filt_const0); + filt1 = (v16i8) __msa_fill_h(filt_const1); + filt2 = (v16i8) __msa_fill_h(filt_const2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + + for (loop_cnt = 4; loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, 
filt1, filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, out0, out1, out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + if (ver_offset) { + res0 = (v16u8) __msa_aver_s_b(out0, src3); + res1 = (v16u8) __msa_aver_s_b(out1, src4); + res2 = (v16u8) __msa_aver_s_b(out2, src5); + res3 = (v16u8) __msa_aver_s_b(out3, src6); + } else { + res0 = (v16u8) __msa_aver_s_b(out0, src2); + res1 = (v16u8) __msa_aver_s_b(out1, src3); + res2 = (v16u8) __msa_aver_s_b(out2, src4); + res3 = (v16u8) __msa_aver_s_b(out3, src5); + } + + XORI_B4_128_UB(res0, res1, res2, res3); + AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3, + dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src2 = src6; + src3 = src7; + src4 = src8; + } +} + +static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, mask2); + hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, mask2); + + PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); + + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + + hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, mask2); + hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, mask2); + + PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8); + + res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + tmp0 = PCKEV_XORI128_UB(res0, res1); + tmp1 = PCKEV_XORI128_UB(res2, res3); + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3); + AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, 
src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v16u8 dst0, dst1, dst2, dst3; + v8i16 res0, res1, res2, res3; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + src += (5 * src_stride); + + hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + + hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + + res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3, + dst, dst_stride); + + dst += (4 * dst_stride); + hz_out3 = hz_out7; + hz_out1 = hz_out5; + hz_out5 = hz_out4; + hz_out4 = hz_out8; + hz_out2 = hz_out6; + hz_out0 = hz_out5; + } +} + +static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16); + avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride, + 16); +} + +static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t horiz_offset) +{ + uint32_t row; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16u8 dst0, dst1, res; + v8i16 vt_res0, vt_res1, vt_res2, vt_res3; + v4i32 hz_res0, hz_res1; + v8i16 res0, res1; + v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5; + v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; + v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; + v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; + v8i16 minus5h = __msa_ldi_h(-5); + v8i16 plus20h = __msa_ldi_h(20); + v8i16 zeros = { 0 }; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + for (row = (height >> 1); row--;) { + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + LD_UB2(dst, dst_stride, dst0, dst1); + + dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0); + + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, + vt_res0, vt_res1); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, + vt_res2, vt_res3); + VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, + mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2); + VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, + mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5); + + 
hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); + DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); + + hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); + DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); + + SRARI_W2_SW(hz_res0, hz_res1, 10); + SAT_SW2_SW(hz_res0, hz_res1, 7); + + res0 = __msa_srari_h(shf_vec2, 5); + res1 = __msa_srari_h(shf_vec5, 5); + + SAT_SH2_SH(res0, res1, 7); + + if (horiz_offset) { + res0 = __msa_ilvod_h(zeros, res0); + res1 = __msa_ilvod_h(zeros, res1); + } else { + ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1); + } + hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0); + hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1); + res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0); + + res = PCKEV_XORI128_UB(res0, res0); + + dst0 = __msa_aver_u_b(res, dst0); + + ST4x2_UB(dst0, dst, dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src1 = src3; + src2 = src4; + src3 = src5; + src4 = src6; + } +} + +static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t horiz_offset) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride, + height, horiz_offset); + + src += 4; + dst += 4; + } +} + +static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t horiz_offset) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride, + height, horiz_offset); + + src += 4; + dst += 4; + } +} + +static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t ver_offset) +{ + int32_t loop_cnt; + int32_t out0, out1; + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6; + v8i16 res0, res1, res2, res3; + v16u8 vec0, vec1; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, mask2); + hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3, + mask0, mask1, mask2); + + PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); + + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + XORI_B2_128_SB(src0, src1); + LD_UB2(dst, dst_stride, dst0, dst1); + hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, + mask0, mask1, + mask2); + hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5); + res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + + if (ver_offset) { + res1 = __msa_srari_h(hz_out3, 5); + res3 = __msa_srari_h(hz_out4, 5); + } else { + res1 = __msa_srari_h(hz_out2, 5); + res3 = __msa_srari_h(hz_out3, 5); + } + + SAT_SH2_SH(res1, res3, 7); + + res0 = __msa_aver_s_h(res0, res1); + res1 = __msa_aver_s_h(res2, res3); + + vec0 = 
PCKEV_XORI128_UB(res0, res0); + vec1 = PCKEV_XORI128_UB(res1, res1); + + AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + + hz_out0 = hz_out2; + hz_out1 = hz_out3; + hz_out2 = hz_out4; + hz_out3 = hz_out5; + hz_out4 = hz_out6; + } +} + +static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t vert_offset) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 res0, res1, res2, res3; + v8i16 res4, res5, res6, res7; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + src += (5 * src_stride); + + hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2); + hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2); + hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2); + hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2); + + res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5); + res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6); + res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7); + res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8); + + if (vert_offset) { + res1 = __msa_srari_h(hz_out3, 5); + res3 = __msa_srari_h(hz_out4, 5); + res5 = __msa_srari_h(hz_out5, 5); + res7 = __msa_srari_h(hz_out6, 5); + } else { + res1 = __msa_srari_h(hz_out2, 5); + res3 = __msa_srari_h(hz_out3, 5); + res5 = __msa_srari_h(hz_out4, 5); + res7 = __msa_srari_h(hz_out5, 5); + } + + SAT_SH4_SH(res1, res3, res5, res7, 7); + + res0 = __msa_aver_s_h(res0, res1); + res1 = __msa_aver_s_h(res2, res3); + res2 = __msa_aver_s_h(res4, res5); + res3 = __msa_aver_s_h(res6, res7); + + CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + hz_out0 = hz_out4; + hz_out1 = hz_out5; + hz_out2 = hz_out6; + hz_out3 = hz_out7; + hz_out4 = hz_out8; + } +} + +static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + uint8_t vert_offset) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + height, vert_offset); + + src += 8; + dst += 8; + } +} + +static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x, + const uint8_t *src_y, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16i8 src_hz0, src_hz1, src_hz2, src_hz3; + v16u8 dst0, 
dst1, dst2, dst3; + v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4; + v16i8 src_vt5, src_vt6, src_vt7, src_vt8; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, vert_out0, vert_out1; + v8i16 res0, res1; + v16u8 res; + + LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); + LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); + src_y += (5 * src_stride); + + src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); + src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); + src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); + src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); + + XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); + LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); + hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1, + mask0, mask1, mask2); + hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3, + mask0, mask1, mask2); + SRARI_H2_SH(hz_out0, hz_out1, 5); + SAT_SH2_SH(hz_out0, hz_out1, 7); + LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); + + src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); + src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); + src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); + src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); + + XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); + + /* filter calc */ + vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2, + src_vt3, src_vt4, src_vt5); + vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4, + src_vt5, src_vt6, src_vt7); + SRARI_H2_SH(vert_out0, vert_out1, 5); + SAT_SH2_SH(vert_out0, vert_out1, 7); + + res1 = __msa_srari_h((hz_out1 + vert_out1), 1); + res0 = __msa_srari_h((hz_out0 + vert_out0), 1); + + SAT_SH2_SH(res0, res1, 7); + res = PCKEV_XORI128_UB(res0, res1); + + dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1); + dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst0 = __msa_aver_u_b(res, dst0); + + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); +} + +static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x, + const uint8_t *src_y, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint32_t loop_cnt; + v16i8 src_hz0, src_hz1, src_hz2, src_hz3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src_vt0, src_vt1, src_vt2, src_vt3; + v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 vert_out0, vert_out1, vert_out2, vert_out3; + v8i16 out0, out1, out2, out3; + + LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); + + LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); + src_y += (5 * src_stride); + + src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1); + src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2); + src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3); + src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4); + + XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3); + XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); + src_x += (4 * src_stride); + + 
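        /* 'hv' quarter-pel positions: the prediction is the rounded average
         * of the horizontal half-pel result (hz_out*, filtered from src_x)
         * and the vertical half-pel result (vert_out*, filtered from src_y);
         * averaging with the loaded dst rows gives the avg_ MC variant. */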
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2); + hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2); + hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2); + hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2); + SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); + SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); + LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8); + src_y += (4 * src_stride); + + src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5); + src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6); + src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7); + src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8); + + XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3, + src_vt4, src_vt5, vert_out0, vert_out1); + AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5, + src_vt6, src_vt7, vert_out2, vert_out3); + SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5); + SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7); + + out0 = __msa_srari_h((hz_out0 + vert_out0), 1); + out1 = __msa_srari_h((hz_out1 + vert_out1), 1); + out2 = __msa_srari_h((hz_out2 + vert_out2), 1); + out3 = __msa_srari_h((hz_out3 + vert_out3), 1); + + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + src_vt0 = src_vt4; + src_vt1 = src_vt5; + src_vt2 = src_vt6; + src_vt3 = src_vt7; + src_vt4 = src_vt8; + } +} + +static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x, + const uint8_t *src_y, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride, + dst, dst_stride); + + src_x += 8; + src_y += 8; + dst += 8; + } + + src_x += (8 * src_stride) - 16; + src_y += (8 * src_stride) - 16; + dst += (8 * dst_stride) - 16; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride, + dst, dst_stride); + + src_x += 8; + src_y += 8; + dst += 8; + } +} + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 
* dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, 
dst3); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + out2 = __msa_copy_u_w((v4i32) dst2, 0); + out3 = __msa_copy_u_w((v4i32) dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_d((v2i64) dst0, 0); + out1 = __msa_copy_u_d((v2i64) dst1, 0); + out2 = __msa_copy_u_d((v2i64) dst2, 0); + out3 = __msa_copy_u_d((v2i64) dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + copy_width16_msa(src, stride, dst, stride, 16); +} + +void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + copy_width8_msa(src, stride, dst, stride, 8); +} + +void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_width16_msa(src, stride, dst, stride, 16); +} + +void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_width8_msa(src, stride, dst, stride, 8); +} + +void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_width4_msa(src, stride, dst, stride, 4); +} + +void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 0); +} + +void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 1); +} + +void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 0); +} + +void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 1); +} + +void 
ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 0); +} + +void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 1); +} + +void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_16w_msa(src - 2, stride, dst, stride, 16); +} + +void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_8w_msa(src - 2, stride, dst, stride, 8); +} + +void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_4w_msa(src - 2, stride, dst, stride, 4); +} + +void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 0); +} + +void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 1); +} + +void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 0); +} + +void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 1); +} + +void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 0); +} + +void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 1); +} + +void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_16w_msa(src - 2, + src - (stride * 2), stride, dst, stride, 16); +} + +void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_16w_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 16); +} + +void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_16w_msa(src + stride - 2, + src - (stride * 2), stride, dst, stride, 16); +} + +void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_16w_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 16); +} + +void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8); +} + +void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_8w_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 8); +} + +void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_8w_msa(src + stride - 2, + src - (stride * 2), stride, dst, stride, 8); +} + +void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_8w_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 8); +} + + +void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4); +} + +void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, + 
ptrdiff_t stride) +{ + avc_luma_hv_qrt_4w_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 4); +} + +void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_4w_msa(src + stride - 2, + src - (stride * 2), stride, dst, stride, 4); +} + +void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_4w_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride, 4); +} + +void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 0); +} + +void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 1); +} + +void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0); +} + +void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1); +} + +void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0); +} + +void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1); +} + +void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_16w_msa(src - (stride * 2), stride, dst, stride, 16); +} + +void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_8w_msa(src - (stride * 2), stride, dst, stride, 8); +} + +void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_4w_msa(src - (stride * 2), stride, dst, stride, 4); +} + +void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 0); +} + +void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 1); +} + +void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0); +} + +void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1); +} + +void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0); +} + +void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1); +} + +void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16); +} + +void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8); +} + +void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + 
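    /* mcXY encodes the quarter-pel phase: X/4 of a sample horizontally and
     * Y/4 vertically. (2,2) is the centre half-pel position served by the
     * "mid" helpers; src is rebased by -2 columns and -2 rows so the 6-tap
     * window is centred on the block. */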
avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4); +} + +void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0); +} + +void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1); +} + +void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0); +} + +void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1); +} + +void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0); +} + +void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1); +} + +void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride); +} + +void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride); +} + +void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2), + stride, dst, stride, 0); +} + +void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2), + stride, dst, stride, 1); +} + +void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2), + stride, dst, stride, 0); +} + +void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2), + stride, dst, stride, 1); +} + +void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2), + stride, dst, stride, 0); +} + +void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2), + stride, dst, stride, 1); +} + +void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, + dst, stride); +} + +void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, + dst, stride); +} + +void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t 
*src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride); +} + + +void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride); +} + +void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, + src - (stride * 2), + stride, dst, stride); +} + +void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, + src - (stride * 2) + + sizeof(uint8_t), stride, dst, stride); +} + +void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 0); +} + +void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 1); +} + +void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2, + stride, dst, stride, 8, 0); +} + +void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2, + stride, dst, stride, 8, 1); +} + +void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2, + stride, dst, stride, 4, 0); +} + +void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2, + stride, dst, stride, 4, 1); +} + +void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride); +} + +void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride); +} + +void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2, + stride, dst, stride, 16, 0); +} + +void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * 
stride) - 2, + stride, dst, stride, 16, 1); +} + +void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2, + stride, dst, stride, 8, 0); +} + +void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2, + stride, dst, stride, 8, 1); +} + +void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2, + stride, dst, stride, 4, 0); +} + +void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2, + stride, dst, stride, 4, 1); +} + +void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2, + stride, dst, stride); +} + +void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2, + stride, dst, stride, 8); +} + +void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2, + stride, dst, stride); +} diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c new file mode 100644 index 0000000000..975d91f8be --- /dev/null +++ b/libavcodec/mips/hevc_idct_msa.c @@ -0,0 +1,939 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" + +static const int16_t gt8x8_cnst[16] = { + 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18 +}; + +static const int16_t gt16x16_cnst[64] = { + 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43, + 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90, + 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57, + 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25 +}; + +static const int16_t gt32x32_cnst0[256] = { + 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, + 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, + 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, + 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, + 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, + 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, + 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, + 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, + 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, + 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, + 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, + 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, + 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, + 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, + 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, + 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 +}; + +static const int16_t gt32x32_cnst1[64] = { + 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, + 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, + 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, + 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90 +}; + +static const int16_t gt32x32_cnst2[16] = { + 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89 +}; + +static const int16_t gt32x32_cnst3[16] = { + 64, 64, 64, 64, 83, 36, -36, -83, 64, -64, -64, 64, 36, -83, 83, -36 +}; + +#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \ + sum0, sum1, sum2, sum3, shift) \ +{ \ + v4i32 vec0, vec1, vec2, vec3, vec4, vec5; \ + v4i32 cnst64 = __msa_ldi_w(64); \ + v4i32 cnst83 = __msa_ldi_w(83); \ + v4i32 cnst36 = __msa_ldi_w(36); \ + \ + DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64, \ + cnst83, cnst36, vec0, vec2, vec1, vec3); \ + DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5); \ + \ + sum0 = vec0 + vec2; \ + sum1 = vec0 - vec2; \ + sum3 = sum0; \ + sum2 = sum1; \ + \ + vec1 += vec3; \ + vec4 -= vec5; \ + \ + sum0 += vec1; \ + sum1 += vec4; \ + sum2 -= vec4; \ + sum3 -= vec1; \ + \ + SRARI_W4_SW(sum0, sum1, sum2, sum3, shift); \ + SAT_SW4_SW(sum0, sum1, sum2, sum3, 15); \ +} + +#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \ +{ \ + v8i16 src0_r, src1_r, src2_r, src3_r; \ + v8i16 src0_l, src1_l, src2_l, src3_l; \ + v8i16 filt0, filter0, filter1, filter2, filter3; \ + v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; 
\ + v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \ + v4i32 sum0_r, sum1_r, sum2_r, sum3_r; \ + v4i32 sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \ + src0_r, src1_r, src2_r, src3_r); \ + ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \ + src0_l, src1_l, src2_l, src3_l); \ + \ + filt0 = LD_SH(filter); \ + SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \ + DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \ + filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \ + \ + BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \ + sum1_l, sum1_r); \ + sum2_r = sum1_r; \ + sum2_l = sum1_l; \ + sum3_r = sum0_r; \ + sum3_l = sum0_l; \ + \ + DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \ + filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \ + \ + temp2_r += temp3_r; \ + temp2_l += temp3_l; \ + sum0_r += temp2_r; \ + sum0_l += temp2_l; \ + sum3_r -= temp2_r; \ + sum3_l -= temp2_l; \ + \ + SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \ + SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \ + PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7); \ + DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \ + filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \ + \ + temp4_r -= temp5_r; \ + temp4_l -= temp5_l; \ + sum1_r += temp4_r; \ + sum1_l += temp4_l; \ + sum2_r -= temp4_r; \ + sum2_l -= temp4_l; \ + \ + SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \ + SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \ + PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4); \ + \ + filt0 = LD_SH(filter + 8); \ + SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \ + DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \ + filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \ + \ + BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \ + sum1_l, sum1_r); \ + sum2_r = sum1_r; \ + sum2_l = sum1_l; \ + sum3_r = sum0_r; \ + sum3_l = sum0_l; \ + \ + DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \ + filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \ + \ + temp2_r += temp3_r; \ + temp2_l += temp3_l; \ + sum0_r += temp2_r; \ + sum0_l += temp2_l; \ + sum3_r -= temp2_r; \ + sum3_l -= temp2_l; \ + \ + SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \ + SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \ + PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6); \ + DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \ + filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \ + \ + temp4_r -= temp5_r; \ + temp4_l -= temp5_l; \ + sum1_r -= temp4_r; \ + sum1_l -= temp4_l; \ + sum2_r += temp4_r; \ + sum2_l += temp4_l; \ + \ + SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \ + SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \ + PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5); \ +} + +#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \ + src4_r, src5_r, src6_r, src7_r, \ + src0_l, src1_l, src2_l, src3_l, \ + src4_l, src5_l, src6_l, src7_l, shift) \ +{ \ + int16_t *ptr0, *ptr1; \ + v8i16 filt0, filt1, dst0, dst1; \ + v8i16 filter0, filter1, filter2, filter3; \ + v4i32 temp0_r, temp1_r, temp0_l, temp1_l; \ + v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \ + v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l; \ + \ + ptr0 = (buf_ptr + 112); \ + ptr1 = (buf_ptr + 128); \ + k = -1; \ + \ + for (j = 0; j < 4; j++) \ + { \ + LD_SH2(filter, 8, filt0, filt1) \ + filter += 16; \ + SPLATI_W2_SH(filt0, 0, filter0, 
filter1); \ + SPLATI_W2_SH(filt1, 0, filter2, filter3); \ + DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l, filter0, filter0, \ + filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l); \ + DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l); \ + DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l, filter1, filter1, \ + filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \ + DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l); \ + \ + sum1_r = sum0_r; \ + sum1_l = sum0_l; \ + \ + SPLATI_W2_SH(filt0, 2, filter0, filter1); \ + SPLATI_W2_SH(filt1, 2, filter2, filter3); \ + DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l); \ + DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l); \ + DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l); \ + \ + sum0_r += temp0_r; \ + sum0_l += temp0_l; \ + sum1_r -= temp0_r; \ + sum1_l -= temp0_l; \ + \ + sum3_r = temp1_r - sum3_r; \ + sum3_l = temp1_l - sum3_l; \ + \ + DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l); \ + DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3, \ + filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \ + \ + sum0_r += temp0_r; \ + sum0_l += temp0_l; \ + sum1_r -= temp0_r; \ + sum1_l -= temp0_l; \ + \ + BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \ + res1_l, res1_r); \ + SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \ + SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \ + PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \ + ST_SH(dst0, buf_ptr); \ + ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16))); \ + \ + BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \ + res1_l, res1_r); \ + SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \ + SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \ + PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \ + ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16))); \ + ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16))); \ + \ + k *= -1; \ + buf_ptr += 16; \ + } \ +} + +#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \ +{ \ + LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l); \ + tmp1_r = sum0_r; \ + tmp1_l = sum0_l; \ + sum0_r += tmp0_r; \ + sum0_l += tmp0_l; \ + ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4); \ + tmp1_r -= tmp0_r; \ + tmp1_l -= tmp0_l; \ + ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4); \ +} + +#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \ + res0, res1, res2, res3, shift) \ +{ \ + v4i32 vec0, vec1, vec2, vec3; \ + v4i32 cnst74 = __msa_ldi_w(74); \ + v4i32 cnst55 = __msa_ldi_w(55); \ + v4i32 cnst29 = __msa_ldi_w(29); \ + \ + vec0 = in_r0 + in_r1; \ + vec2 = in_r0 - in_l1; \ + res0 = vec0 * cnst29; \ + res1 = vec2 * cnst55; \ + res2 = in_r0 - in_r1; \ + vec1 = in_r1 + in_l1; \ + res2 += in_l1; \ + vec3 = in_l0 * cnst74; \ + res3 = vec0 * cnst55; \ + \ + res0 += vec1 * cnst55; \ + res1 -= vec1 * cnst29; \ + res2 *= cnst74; \ + res3 += vec2 * cnst29; \ + \ + res0 += vec3; \ + res1 += vec3; \ + res3 -= vec3; \ + \ + SRARI_W4_SW(res0, res1, res2, res3, shift); \ + SAT_SW4_SW(res0, res1, res2, res3, 15); \ +} + +static void hevc_idct_4x4_msa(int16_t *coeffs) +{ + v8i16 in0, in1; + v4i32 in_r0, in_l0, in_r1, in_l1; + v4i32 sum0, sum1, sum2, sum3; + v8i16 zeros = { 0 }; + + LD_SH2(coeffs, 8, in0, in1); + ILVRL_H2_SW(zeros, in0, in_r0, in_l0); + ILVRL_H2_SW(zeros, in1, in_r1, in_l1); + + HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7); + TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1); + 
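    /* Second pass: HEVC runs the inverse transform in two stages, shift 7
     * for the first stage and shift 12 (20 - bit depth, for 8-bit content)
     * for the second, transposing in between and saturating the
     * intermediates to 16 bits. */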
HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12); + TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, sum0, sum1, sum2, sum3); + PCKEV_H2_SH(sum1, sum0, sum3, sum2, in0, in1); + ST_SH2(in0, in1, coeffs, 8); +} + +static void hevc_idct_8x8_msa(int16_t *coeffs) +{ + int16_t *filter = >8x8_cnst[0]; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); + HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8); +} + +static void hevc_idct_16x16_msa(int16_t *coeffs) +{ + int16_t i, j, k; + int16_t buf[256]; + int16_t *buf_ptr = &buf[0]; + int16_t *src = coeffs; + int16_t *filter = >16x16_cnst[0]; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; + + for (i = 2; i--;) { + LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15); + + ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, + src0_r, src1_r, src2_r, src3_r); + ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, + src4_r, src5_r, src6_r, src7_r); + ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, + src0_l, src1_l, src2_l, src3_l); + ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, + src4_l, src5_l, src6_l, src7_l); + HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, + src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, + src4_l, src5_l, src6_l, src7_l, 7); + + src += 8; + buf_ptr = (&buf[0] + 8); + filter = >16x16_cnst[0]; + } + + src = &buf[0]; + buf_ptr = coeffs; + filter = >16x16_cnst[0]; + + for (i = 2; i--;) { + LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11, + in4, in12, in5, in13, in6, in14, in7, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, + src0_r, src1_r, src2_r, src3_r); + ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, + src4_r, src5_r, src6_r, src7_r); + ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, + src0_l, src1_l, src2_l, src3_l); + ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, + src4_l, src5_l, src6_l, src7_l); + HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, + src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, + src4_l, src5_l, src6_l, src7_l, 12); + + src += 128; + buf_ptr = coeffs + 8; + filter = >16x16_cnst[0]; + } + + LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16); + + LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15); + 
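    /* Final transpose of the 16x16 block done as four 8x8 transposes: the
     * two diagonal sub-blocks stay in place, while the top-right
     * (coeffs + 8) and bottom-left (coeffs + 128) sub-blocks are transposed
     * and exchanged. */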
ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16); + + LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16); +} + +static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch, + uint8_t round) +{ + uint8_t i; + int16_t *filter_ptr0 = >32x32_cnst0[0]; + int16_t *filter_ptr1 = >32x32_cnst1[0]; + int16_t *filter_ptr2 = >32x32_cnst2[0]; + int16_t *filter_ptr3 = >32x32_cnst3[0]; + int16_t *src0 = (coeffs + buf_pitch); + int16_t *src1 = (coeffs + 2 * buf_pitch); + int16_t *src2 = (coeffs + 4 * buf_pitch); + int16_t *src3 = (coeffs); + int32_t cnst0, cnst1; + int32_t tmp_buf[8 * 32]; + int32_t *tmp_buf_ptr = &tmp_buf[0]; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; + v8i16 filt0, filter0, filter1, filter2, filter3; + v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l; + + /* process coeff 4, 12, 20, 28 */ + LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3); + ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r); + ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l); + + /* loop for all columns of constants */ + for (i = 0; i < 4; i++) { + /* processing single column of constants */ + cnst0 = LW(filter_ptr2); + cnst1 = LW(filter_ptr2 + 2); + + filter0 = (v8i16) __msa_fill_w(cnst0); + filter1 = (v8i16) __msa_fill_w(cnst1); + + DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); + DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); + ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + i * 8), 4); + + filter_ptr2 += 4; + } + + /* process coeff 0, 8, 16, 24 */ + LD_SH2(src3, 16 * buf_pitch, in0, in2); + LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in1, in3); + + ILVR_H2_SH(in2, in0, in3, in1, src0_r, src1_r); + ILVL_H2_SH(in2, in0, in3, in1, src0_l, src1_l); + + /* loop for all columns of constants */ + for (i = 0; i < 2; i++) { + /* processing first column of filter constants */ + cnst0 = LW(filter_ptr3); + cnst1 = LW(filter_ptr3 + 4); + + filter0 = (v8i16) __msa_fill_w(cnst0); + filter1 = (v8i16) __msa_fill_w(cnst1); + + DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, filter1, + filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); + + sum1_r = sum0_r; + sum1_l = sum0_l; + sum0_r += tmp1_r; + sum0_l += tmp1_l; + + sum1_r -= tmp1_r; + sum1_l -= tmp1_l; + + HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i)); + HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i)); + + filter_ptr3 += 8; + } + + /* process coeff 2 6 10 14 18 22 26 30 */ + LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src0_r, src1_r, src2_r, src3_r); + ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src0_l, src1_l, src2_l, src3_l); + + /* loop for all columns of constants */ + for (i = 0; i < 8; i++) { + /* processing single column of constants */ + filt0 = LD_SH(filter_ptr1); + SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); + DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); + DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, 
filter1, filter1, filter2, + filter2, sum0_r, sum0_l, sum0_r, sum0_l); + DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); + + LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); + tmp1_r = tmp0_r; + tmp1_l = tmp0_l; + tmp0_r += sum0_r; + tmp0_l += sum0_l; + ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4); + tmp1_r -= sum0_r; + tmp1_l -= sum0_l; + ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4); + + filter_ptr1 += 8; + } + + /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */ + LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); + src0 += 16 * buf_pitch; + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src0_r, src1_r, src2_r, src3_r); + ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src0_l, src1_l, src2_l, src3_l); + + LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src4_r, src5_r, src6_r, src7_r); + ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, + src4_l, src5_l, src6_l, src7_l); + + /* loop for all columns of filter constants */ + for (i = 0; i < 16; i++) { + /* processing single column of constants */ + filt0 = LD_SH(filter_ptr0); + SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); + DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); + DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, + filter2, sum0_r, sum0_l, sum0_r, sum0_l); + DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); + + tmp1_r = sum0_r; + tmp1_l = sum0_l; + + filt0 = LD_SH(filter_ptr0 + 8); + SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); + DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l); + DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2, + filter2, sum0_r, sum0_l, sum0_r, sum0_l); + DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l); + + sum0_r += tmp1_r; + sum0_l += tmp1_l; + + LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); + tmp1_r = tmp0_r; + tmp1_l = tmp0_l; + tmp0_r += sum0_r; + tmp0_l += sum0_l; + sum1_r = __msa_fill_w(round); + SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r); + SAT_SW2_SW(tmp0_r, tmp0_l, 15); + in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r); + ST_SH(in0, (coeffs + i * buf_pitch)); + tmp1_r -= sum0_r; + tmp1_l -= sum0_l; + SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r); + SAT_SW2_SW(tmp1_r, tmp1_l, 15); + in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r); + ST_SH(in0, (coeffs + (31 - i) * buf_pitch)); + + filter_ptr0 += 16; + } +} + +static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf) +{ + uint8_t i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (i = 0; i < 4; i++) { + LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8); + } +} + +static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs) +{ + uint8_t i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (i = 0; i < 4; i++) { + LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32); + } +} + +static void hevc_idct_32x32_msa(int16_t *coeffs) +{ + uint8_t row_cnt, col_cnt; + int16_t *src = coeffs; + int16_t tmp_buf[8 * 32]; + int16_t *tmp_buf_ptr = &tmp_buf[0]; + uint8_t round; + 
uint8_t buf_pitch; + + /* column transform */ + round = 7; + buf_pitch = 32; + for (col_cnt = 0; col_cnt < 4; col_cnt++) { + /* process 8x32 blocks */ + hevc_idct_8x32_column_msa((coeffs + col_cnt * 8), buf_pitch, round); + } + + /* row transform */ + round = 12; + buf_pitch = 8; + for (row_cnt = 0; row_cnt < 4; row_cnt++) { + /* process 32x8 blocks */ + src = (coeffs + 32 * 8 * row_cnt); + + hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr); + hevc_idct_8x32_column_msa(tmp_buf_ptr, buf_pitch, round); + hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src); + } +} + +static void hevc_idct_dc_4x4_msa(int16_t *coeffs) +{ + int32_t val; + v8i16 dst; + + val = (coeffs[0] + 1) >> 1; + val = (val + 32) >> 6; + dst = __msa_fill_h(val); + + ST_SH2(dst, dst, coeffs, 8); +} + +static void hevc_idct_dc_8x8_msa(int16_t *coeffs) +{ + int32_t val; + v8i16 dst; + + val = (coeffs[0] + 1) >> 1; + val = (val + 32) >> 6; + dst = __msa_fill_h(val); + + ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8); +} + +static void hevc_idct_dc_16x16_msa(int16_t *coeffs) +{ + uint8_t loop; + int32_t val; + v8i16 dst; + + val = (coeffs[0] + 1) >> 1; + val = (val + 32) >> 6; + dst = __msa_fill_h(val); + + for (loop = 4; loop--;) { + ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8); + coeffs += 8 * 8; + } +} + +static void hevc_idct_dc_32x32_msa(int16_t *coeffs) +{ + uint8_t loop; + int32_t val; + v8i16 dst; + + val = (coeffs[0] + 1) >> 1; + val = (val + 32) >> 6; + dst = __msa_fill_h(val); + + for (loop = 16; loop--;) { + ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8); + coeffs += 8 * 8; + } +} + +static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) +{ + uint32_t dst0, dst1, dst2, dst3; + v8i16 dst_r0, dst_l0, in0, in1; + v4i32 dst_vec = { 0 }; + v16u8 zeros = { 0 }; + + LD_SH2(coeffs, 8, in0, in1); + LW4(dst, stride, dst0, dst1, dst2, dst3); + INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec); + ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0); + ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0); + CLIP_SH2_0_255(dst_r0, dst_l0); + dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0); + ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride); +} + +static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) +{ + uint8_t *temp_dst = dst; + uint64_t dst0, dst1, dst2, dst3; + v2i64 dst_vec0 = { 0 }; + v2i64 dst_vec1 = { 0 }; + v8i16 dst_r0, dst_l0, dst_r1, dst_l1; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 zeros = { 0 }; + + LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD4(temp_dst, stride, dst0, dst1, dst2, dst3); + temp_dst += (4 * stride); + + INSERT_D2_SD(dst0, dst1, dst_vec0); + INSERT_D2_SD(dst2, dst3, dst_vec1); + ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0); + ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1); + ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3, + dst_r0, dst_l0, dst_r1, dst_l1); + CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); + PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); + ST8x4_UB(dst_r0, dst_r1, dst, stride); + dst += (4 * stride); + + LD4(temp_dst, stride, dst0, dst1, dst2, dst3); + INSERT_D2_SD(dst0, dst1, dst_vec0); + INSERT_D2_SD(dst2, dst3, dst_vec1); + UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0); + UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1); + ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7, + dst_r0, dst_l0, dst_r1, dst_l1); + CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); + PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); + ST8x4_UB(dst_r0, dst_r1, dst, stride); +} + 
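The hevc_addblk_* routines here vectorise HEVC's add-residual step: the 16-bit inverse-transform output is added to the 8-bit prediction and clamped back to pixel range. A minimal scalar sketch of the same operation (illustrative only; the name and size parameter are not part of the patch):

#include <stdint.h>

/* Scalar reference for the add-residual step: add the 16-bit inverse
 * transform output to the 8-bit prediction in dst and clamp to [0, 255].
 * Sketch under assumed naming, not taken from the patch. */
static void addblk_ref(const int16_t *coeffs, uint8_t *dst, int stride, int size)
{
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int v = dst[x] + coeffs[x];
            dst[x] = v < 0 ? 0 : (v > 255 ? 255 : v);
        }
        coeffs += size;
        dst    += stride;
    }
}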
+static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) +{ + uint8_t loop_cnt; + uint8_t *temp_dst = dst; + v16u8 dst0, dst1, dst2, dst3; + v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (loop_cnt = 4; loop_cnt--;) { + LD_SH4(coeffs, 16, in0, in2, in4, in6); + LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); + coeffs += 64; + LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3); + temp_dst += (4 * stride); + + UNPCK_UB_SH(dst0, dst_r0, dst_l0); + UNPCK_UB_SH(dst1, dst_r1, dst_l1); + UNPCK_UB_SH(dst2, dst_r2, dst_l2); + UNPCK_UB_SH(dst3, dst_r3, dst_l3); + + dst_r0 += in0; + dst_l0 += in1; + dst_r1 += in2; + dst_l1 += in3; + dst_r2 += in4; + dst_l2 += in5; + dst_r3 += in6; + dst_l3 += in7; + + CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); + CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); + PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, + dst_r3, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, stride); + dst += (4 * stride); + } +} + +static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) +{ + uint8_t loop_cnt; + uint8_t *temp_dst = dst; + v16u8 dst0, dst1, dst2, dst3; + v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SH4(coeffs, 32, in0, in2, in4, in6); + LD_SH4((coeffs + 8), 32, in1, in3, in5, in7); + LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3); + + UNPCK_UB_SH(dst0, dst_r0, dst_l0); + UNPCK_UB_SH(dst1, dst_r1, dst_l1); + UNPCK_UB_SH(dst2, dst_r2, dst_l2); + UNPCK_UB_SH(dst3, dst_r3, dst_l3); + + dst_r0 += in0; + dst_l0 += in1; + dst_r1 += in2; + dst_l1 += in3; + dst_r2 += in4; + dst_l2 += in5; + dst_r3 += in6; + dst_l3 += in7; + + CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); + CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); + PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, + dst_r3, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, stride); + + LD_SH4((coeffs + 16), 32, in0, in2, in4, in6); + LD_SH4((coeffs + 24), 32, in1, in3, in5, in7); + coeffs += 128; + LD_UB4((temp_dst + 16), stride, dst0, dst1, dst2, dst3); + temp_dst += (4 * stride); + + UNPCK_UB_SH(dst0, dst_r0, dst_l0); + UNPCK_UB_SH(dst1, dst_r1, dst_l1); + UNPCK_UB_SH(dst2, dst_r2, dst_l2); + UNPCK_UB_SH(dst3, dst_r3, dst_l3); + + dst_r0 += in0; + dst_l0 += in1; + dst_r1 += in2; + dst_l1 += in3; + dst_r2 += in4; + dst_l2 += in5; + dst_r3 += in6; + dst_l3 += in7; + + CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); + CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); + PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, + dst_r3, dst0, dst1, dst2, dst3); + + ST_UB4(dst0, dst1, dst2, dst3, (dst + 16), stride); + dst += (4 * stride); + } +} + +static void hevc_idct_luma_4x4_msa(int16_t *coeffs) +{ + v8i16 in0, in1, dst0, dst1; + v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3; + + LD_SH2(coeffs, 8, in0, in1); + UNPCK_SH_SW(in0, in_r0, in_l0); + UNPCK_SH_SW(in1, in_r1, in_l1); + HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3, + 7); + TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1); + HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3, + 12); + TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, res0, res1, res2, res3); + PCKEV_H2_SH(res1, res0, res3, res2, dst0, dst1); + ST_SH2(dst0, dst1, coeffs, 8); +} + +void ff_hevc_idct_4x4_msa(int16_t 
*coeffs, int col_limit) +{ + hevc_idct_4x4_msa(coeffs); +} + +void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit) +{ + hevc_idct_8x8_msa(coeffs); +} + +void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit) +{ + hevc_idct_16x16_msa(coeffs); +} + +void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit) +{ + hevc_idct_32x32_msa(coeffs); +} + +void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +{ + hevc_addblk_4x4_msa(coeffs, dst, stride); +} + +void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +{ + hevc_addblk_8x8_msa(coeffs, dst, stride); +} + +void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +{ + hevc_addblk_16x16_msa(coeffs, dst, stride); +} + +void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +{ + hevc_addblk_32x32_msa(coeffs, dst, stride); +} + +void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs) +{ + hevc_idct_dc_4x4_msa(coeffs); +} + +void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs) +{ + hevc_idct_dc_8x8_msa(coeffs); +} + +void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs) +{ + hevc_idct_dc_16x16_msa(coeffs); +} + +void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs) +{ + hevc_idct_dc_32x32_msa(coeffs); +} + +void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs) +{ + hevc_idct_luma_4x4_msa(coeffs); +} diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c new file mode 100644 index 0000000000..da1db51ef5 --- /dev/null +++ b/libavcodec/mips/hevc_lpf_sao_msa.c @@ -0,0 +1,2088 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" + +static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, + int32_t beta, int32_t *tc, + uint8_t *p_is_pcm, uint8_t *q_is_pcm) +{ + uint8_t *p3 = src - (stride << 2); + uint8_t *p2 = src - ((stride << 1) + stride); + uint8_t *p1 = src - (stride << 1); + uint8_t *p0 = src - stride; + uint8_t *q0 = src; + uint8_t *q1 = src + stride; + uint8_t *q2 = src + (stride << 1); + uint8_t *q3 = src + (stride << 1) + stride; + uint8_t flag0, flag1; + int32_t dp00, dq00, dp30, dq30, d00, d30; + int32_t dp04, dq04, dp34, dq34, d04, d34; + int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; + int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; + uint64_t dst_val0, dst_val1; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5; + v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec; + v8u16 temp0, temp1; + v8i16 temp2; + v8i16 tc_pos, tc_neg; + v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0; + v16i8 zero = { 0 }; + v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; + + dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]); + dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]); + dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]); + dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]); + d00 = dp00 + dq00; + d30 = dp30 + dq30; + p_is_pcm0 = p_is_pcm[0]; + q_is_pcm0 = q_is_pcm[0]; + dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]); + dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]); + dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]); + dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]); + d04 = dp04 + dq04; + d34 = dp34 + dq34; + p_is_pcm4 = p_is_pcm[1]; + q_is_pcm4 = q_is_pcm[1]; + + if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) { + if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) { + p3_src = LD_UH(p3); + p2_src = LD_UH(p2); + p1_src = LD_UH(p1); + p0_src = LD_UH(p0); + q0_src = LD_UH(q0); + q1_src = LD_UH(q1); + q2_src = LD_UH(q2); + q3_src = LD_UH(q3); + + tc0 = tc[0]; + beta30 = beta >> 3; + beta20 = beta >> 2; + tc250 = ((tc0 * 5 + 1) >> 1); + tc4 = tc[1]; + tc254 = ((tc4 * 5 + 1) >> 1); + + flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 && + abs(p0[0] - q0[0]) < tc250 && + abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 && + abs(p0[3] - q0[3]) < tc250 && + (d00 << 1) < beta20 && (d30 << 1) < beta20); + cmp0 = __msa_fill_d(flag0); + + flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 && + abs(p0[4] - q0[4]) < tc254 && + abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 && + abs(p0[7] - q0[7]) < tc254 && + (d04 << 1) < beta20 && (d34 << 1) < beta20); + cmp1 = __msa_fill_d(flag1); + cmp2 = __msa_ilvev_d(cmp1, cmp0); + cmp2 = __msa_ceqi_d(cmp2, 0); + + ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src, + zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src, + p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, + q3_src); + + cmp0 = (v2i64) __msa_fill_h(tc0); + cmp1 = (v2i64) __msa_fill_h(tc4); + tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); + tc_pos <<= 1; + tc_neg = -tc_pos; + + temp0 = (p1_src + p0_src + q0_src); + temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - p2_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst0 = (v16u8) (temp2 + (v8i16) p2_src); + + temp1 = temp0 + p2_src; + temp1 = (v8u16) 
__msa_srari_h((v8i16) temp1, 2); + temp2 = (v8i16) (temp1 - p1_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst1 = (v16u8) (temp2 + (v8i16) p1_src); + + temp1 = (temp0 << 1) + p2_src + q1_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - p0_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst2 = (v16u8) (temp2 + (v8i16) p0_src); + + cmp0 = __msa_fill_d(p_is_pcm0); + cmp1 = __msa_fill_d(p_is_pcm4); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); + dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); + dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); + + temp0 = (q1_src + p0_src + q0_src); + + temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - q2_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst5 = (v16u8) (temp2 + (v8i16) q2_src); + + temp1 = temp0 + q2_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); + temp2 = (v8i16) (temp1 - q1_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst4 = (v16u8) (temp2 + (v8i16) q1_src); + + temp1 = (temp0 << 1) + p1_src + q2_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - q0_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst3 = (v16u8) (temp2 + (v8i16) q0_src); + + cmp0 = __msa_fill_d(q_is_pcm0); + cmp1 = __msa_fill_d(q_is_pcm4); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); + dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec); + dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); + + tc_pos >>= 1; + tc_neg = -tc_pos; + + diff0 = (v8i16) (q0_src - p0_src); + diff1 = (v8i16) (q1_src - p1_src); + diff0 = (diff0 << 3) + diff0; + diff1 = (diff1 << 1) + diff1; + delta0 = diff0 - diff1; + delta0 = __msa_srari_h(delta0, 4); + + temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); + abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); + abs_delta0 = (v8u16) abs_delta0 < temp1; + + delta0 = CLIP_SH(delta0, tc_neg, tc_pos); + + temp0 = (v8u16) (delta0 + p0_src); + temp0 = (v8u16) CLIP_SH_0_255(temp0); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + (v16u8) p_is_pcm_vec); + + temp2 = (v8i16) (q0_src - delta0); + temp2 = CLIP_SH_0_255(temp2); + temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, + (v16u8) q_is_pcm_vec); + + tmp = (beta + (beta >> 1)) >> 3; + cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp)); + cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp)); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp)); + cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp)); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + tc_pos >>= 1; + tc_neg = -tc_pos; + + delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); + delta1 -= (v8i16) p1_src; + delta1 += delta0; + delta1 >>= 1; + delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + delta1 = (v8i16) p1_src + (v8i16) delta1; + delta1 = CLIP_SH_0_255(delta1); + delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, + (v16u8) p_is_pcm_vec); + + delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src); + delta2 = delta2 - (v8i16) q1_src; + delta2 = delta2 - delta0; + delta2 = delta2 >> 1; + delta2 = CLIP_SH(delta2, tc_neg, 
tc_pos); + delta2 = (v8i16) q1_src + (v8i16) delta2; + delta2 = CLIP_SH_0_255(delta2); + delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, + (v16u8) q_is_pcm_vec); + + delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, + (v16u8) abs_delta0); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + (v16u8) abs_delta0); + temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, + (v16u8) abs_delta0); + delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, + (v16u8) abs_delta0); + + dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2); + dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2); + dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2); + dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2); + dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2); + dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2); + + cmp0 = __msa_fill_d(d00 + d30 >= beta); + cmp1 = __msa_fill_d(d04 + d34 >= beta); + cmp0 = __msa_ilvev_d(cmp1, cmp0); + cmp0 = __msa_ceqi_d(cmp0, 0); + + dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0); + dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0); + dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0); + dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0); + dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0); + dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0); + + PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + + dst_val0 = __msa_copy_u_d((v2i64) dst2, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst2, 1); + + ST8x4_UB(dst0, dst1, p2, stride); + p2 += (4 * stride); + SD(dst_val0, p2); + p2 += stride; + SD(dst_val1, p2); + } + } +} + +static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, + int32_t beta, int32_t *tc, + uint8_t *p_is_pcm, uint8_t *q_is_pcm) +{ + uint8_t *p3 = src; + uint8_t *p2 = src + 3 * stride; + uint8_t *p1 = src + (stride << 2); + uint8_t *p0 = src + 7 * stride; + uint8_t flag0, flag1; + uint16_t tmp0, tmp1; + uint32_t tmp2, tmp3; + int32_t dp00, dq00, dp30, dq30, d00, d30; + int32_t dp04, dq04, dp34, dq34, d04, d34; + int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; + int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec; + v8u16 temp0, temp1; + v8i16 temp2; + v8i16 tc_pos, tc_neg; + v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0; + v16i8 zero = { 0 }; + v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; + + dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]); + dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]); + dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]); + dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]); + d00 = dp00 + dq00; + d30 = dp30 + dq30; + p_is_pcm0 = p_is_pcm[0]; + q_is_pcm0 = q_is_pcm[0]; + + dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]); + dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]); + dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]); + dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]); + d04 = dp04 + dq04; + d34 = dp34 + dq34; + p_is_pcm4 = p_is_pcm[1]; + q_is_pcm4 = q_is_pcm[1]; + + if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) { + if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) { + src -= 4; + LD_UH8(src, stride, + p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, + q3_src); + + tc0 = tc[0]; + beta30 = beta >> 3; + beta20 = beta >> 2; + tc250 = ((tc0 * 5 + 1) >> 1); + + tc4 = tc[1]; + tc254 = ((tc4 * 5 + 1) >> 1); + + TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, 
q0_src, q1_src, + q2_src, q3_src, p3_src, p2_src, p1_src, p0_src, + q0_src, q1_src, q2_src, q3_src); + + flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 && + abs(p3[-1] - p3[0]) < tc250 && + abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 && + abs(p2[-1] - p2[0]) < tc250 && + (d00 << 1) < beta20 && (d30 << 1) < beta20); + cmp0 = __msa_fill_d(flag0); + + flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 && + abs(p1[-1] - p1[0]) < tc254 && + abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 && + abs(p0[-1] - p0[0]) < tc254 && + (d04 << 1) < beta20 && (d34 << 1) < beta20); + cmp1 = __msa_fill_d(flag1); + cmp2 = __msa_ilvev_d(cmp1, cmp0); + cmp2 = __msa_ceqi_d(cmp2, 0); + + ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src, + zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src, + p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, + q3_src); + + cmp0 = (v2i64) __msa_fill_h(tc0 << 1); + cmp1 = (v2i64) __msa_fill_h(tc4 << 1); + tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); + tc_neg = -tc_pos; + + temp0 = (p1_src + p0_src + q0_src); + + temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - p2_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst0 = (v16u8) (temp2 + (v8i16) p2_src); + + temp1 = temp0 + p2_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); + temp2 = (v8i16) (temp1 - p1_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst1 = (v16u8) (temp2 + (v8i16) p1_src); + + temp1 = (temp0 << 1) + p2_src + q1_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - p0_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst2 = (v16u8) (temp2 + (v8i16) p0_src); + + cmp0 = __msa_fill_d(p_is_pcm0); + cmp1 = __msa_fill_d(p_is_pcm4); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); + dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); + dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); + + temp0 = (q1_src + p0_src + q0_src); + temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - q2_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst5 = (v16u8) (temp2 + (v8i16) q2_src); + + temp1 = temp0 + q2_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); + temp2 = (v8i16) (temp1 - q1_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst4 = (v16u8) (temp2 + (v8i16) q1_src); + + temp1 = (temp0 << 1) + p1_src + q2_src; + temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); + temp2 = (v8i16) (temp1 - q0_src); + temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + dst3 = (v16u8) (temp2 + (v8i16) q0_src); + + cmp0 = __msa_fill_d(q_is_pcm0); + cmp1 = __msa_fill_d(q_is_pcm4); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); + dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec); + dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); + + tc_pos >>= 1; + tc_neg = -tc_pos; + + diff0 = (v8i16) (q0_src - p0_src); + diff1 = (v8i16) (q1_src - p1_src); + diff0 = (v8i16) (diff0 << 3) + diff0; + diff1 = (v8i16) (diff1 << 1) + diff1; + delta0 = diff0 - diff1; + delta0 = __msa_srari_h(delta0, 4); + + temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); + abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); + abs_delta0 = (v8u16) abs_delta0 < temp1; + + delta0 = 
CLIP_SH(delta0, tc_neg, tc_pos); + temp0 = (v8u16) delta0 + p0_src; + temp0 = (v8u16) CLIP_SH_0_255(temp0); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + (v16u8) p_is_pcm_vec); + + temp2 = (v8i16) q0_src - delta0; + temp2 = CLIP_SH_0_255(temp2); + temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, + (v16u8) q_is_pcm_vec); + + tmp = ((beta + (beta >> 1)) >> 3); + cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp)); + cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp)); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp)); + cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp)); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + tc_pos >>= 1; + tc_neg = -tc_pos; + + delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); + delta1 -= (v8i16) p1_src; + delta1 += delta0; + delta1 >>= 1; + delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + delta1 = (v8i16) p1_src + (v8i16) delta1; + delta1 = CLIP_SH_0_255(delta1); + delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, + (v16u8) p_is_pcm_vec); + + delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src); + delta2 = delta2 - (v8i16) q1_src; + delta2 = delta2 - delta0; + delta2 = delta2 >> 1; + delta2 = CLIP_SH(delta2, tc_neg, tc_pos); + delta2 = (v8i16) q1_src + (v8i16) delta2; + delta2 = CLIP_SH_0_255(delta2); + delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, + (v16u8) q_is_pcm_vec); + delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, + (v16u8) abs_delta0); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + (v16u8) abs_delta0); + temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, + (v16u8) abs_delta0); + delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, + (v16u8) abs_delta0); + + dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2); + dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2); + dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2); + dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2); + dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2); + dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2); + + cmp0 = __msa_fill_d(d00 + d30 >= beta); + dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta); + cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0); + dst6 = (v16u8) __msa_ceqi_d(cmp0, 0); + + dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6); + dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6); + dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6); + dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6); + dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6); + dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6); + + PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3, + dst0, dst1, dst2, dst3); + PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5); + + TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, + dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + src += 1; + + tmp2 = __msa_copy_u_w((v4i32) dst0, 0); + tmp0 = __msa_copy_u_h((v8i16) dst0, 2); + tmp3 = __msa_copy_u_w((v4i32) dst1, 0); + tmp1 = __msa_copy_u_h((v8i16) dst1, 2); + SW(tmp2, src); + SH(tmp0, src + 4); + src += stride; + SW(tmp3, src); + SH(tmp1, src + 4); + src += stride; + + tmp2 = __msa_copy_u_w((v4i32) dst2, 0); + tmp0 = __msa_copy_u_h((v8i16) dst2, 2); + tmp3 = __msa_copy_u_w((v4i32) dst3, 0); + tmp1 = __msa_copy_u_h((v8i16) dst3, 2); + SW(tmp2, src); + SH(tmp0, src + 4); + src += stride; + SW(tmp3, src); + SH(tmp1, src + 4); + src += 
stride; + + tmp2 = __msa_copy_u_w((v4i32) dst4, 0); + tmp0 = __msa_copy_u_h((v8i16) dst4, 2); + tmp3 = __msa_copy_u_w((v4i32) dst5, 0); + tmp1 = __msa_copy_u_h((v8i16) dst5, 2); + SW(tmp2, src); + SH(tmp0, src + 4); + src += stride; + SW(tmp3, src); + SH(tmp1, src + 4); + src += stride; + + tmp2 = __msa_copy_u_w((v4i32) dst6, 0); + tmp0 = __msa_copy_u_h((v8i16) dst6, 2); + tmp3 = __msa_copy_u_w((v4i32) dst7, 0); + tmp1 = __msa_copy_u_h((v8i16) dst7, 2); + SW(tmp2, src); + SH(tmp0, src + 4); + src += stride; + SW(tmp3, src); + SH(tmp1, src + 4); + } + } +} + +static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride, + int32_t *tc, uint8_t *p_is_pcm, + uint8_t *q_is_pcm) +{ + uint8_t *p1_ptr = src - (stride << 1); + uint8_t *p0_ptr = src - stride; + uint8_t *q0_ptr = src; + uint8_t *q1_ptr = src + stride; + v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; + v8u16 p1, p0, q0, q1; + v8i16 tc_pos, tc_neg; + v16i8 zero = { 0 }; + v8i16 temp0, temp1, delta; + + if (!(tc[0] <= 0) || !(tc[1] <= 0)) { + cmp0 = (v2i64) __msa_fill_h(tc[0]); + cmp1 = (v2i64) __msa_fill_h(tc[1]); + tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); + tc_neg = -tc_pos; + + cmp0 = __msa_fill_d(p_is_pcm[0]); + cmp1 = __msa_fill_d(p_is_pcm[1]); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + cmp0 = __msa_fill_d(q_is_pcm[0]); + cmp1 = __msa_fill_d(q_is_pcm[1]); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + p1 = LD_UH(p1_ptr); + p0 = LD_UH(p0_ptr); + q0 = LD_UH(q0_ptr); + q1 = LD_UH(q1_ptr); + + ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1); + + temp0 = (v8i16) (q0 - p0); + temp1 = (v8i16) (p1 - q1); + temp0 <<= 2; + temp0 += temp1; + delta = __msa_srari_h((v8i16) temp0, 3); + delta = CLIP_SH(delta, tc_neg, tc_pos); + + temp0 = (v8i16) ((v8i16) p0 + delta); + temp0 = CLIP_SH_0_255(temp0); + temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, + (v16u8) p_is_pcm_vec); + + temp1 = (v8i16) ((v8i16) q0 - delta); + temp1 = CLIP_SH_0_255(temp1); + temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, + (v16u8) q_is_pcm_vec); + + tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0); + temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos); + temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos); + + temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0); + ST8x2_UB(temp0, p0_ptr, stride); + } +} + +static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride, + int32_t *tc, uint8_t *p_is_pcm, + uint8_t *q_is_pcm) +{ + v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 p1, p0, q0, q1; + v8i16 tc_pos, tc_neg; + v16i8 zero = { 0 }; + v8i16 temp0, temp1, delta; + + if (!(tc[0] <= 0) || !(tc[1] <= 0)) { + cmp0 = (v2i64) __msa_fill_h(tc[0]); + cmp1 = (v2i64) __msa_fill_h(tc[1]); + tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); + tc_neg = -tc_pos; + + cmp0 = __msa_fill_d(p_is_pcm[0]); + cmp1 = __msa_fill_d(p_is_pcm[1]); + p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); + + cmp0 = __msa_fill_d(q_is_pcm[0]); + cmp1 = __msa_fill_d(q_is_pcm[1]); + q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); + q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); + + src -= 2; + LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); + TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7, + p1, p0, q0, q1); + ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1); 
+ + temp0 = (v8i16) (q0 - p0); + temp1 = (v8i16) (p1 - q1); + temp0 <<= 2; + temp0 += temp1; + delta = __msa_srari_h((v8i16) temp0, 3); + delta = CLIP_SH(delta, tc_neg, tc_pos); + + temp0 = (v8i16) ((v8i16) p0 + delta); + temp0 = CLIP_SH_0_255(temp0); + temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, + (v16u8) p_is_pcm_vec); + + temp1 = (v8i16) ((v8i16) q0 - delta); + temp1 = CLIP_SH_0_255(temp1); + temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, + (v16u8) q_is_pcm_vec); + + tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0); + temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos); + temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos); + + temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0); + + src += 1; + ST2x4_UB(temp0, 0, src, stride); + src += (4 * stride); + ST2x4_UB(temp0, 4, src, stride); + } +} + +static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride, + uint8_t *src, int32_t src_stride, + int32_t sao_left_class, + int16_t *sao_offset_val, + int32_t height) +{ + int32_t h_cnt; + v16u8 src0, src1, src2, src3; + v16i8 src0_r, src1_r; + v16i8 offset, offset_val, mask; + v16i8 offset0 = { 0 }; + v16i8 offset1 = { 0 }; + v16i8 zero = { 0 }; + v8i16 temp0, temp1, dst0, dst1; + + offset_val = LD_SB(sao_offset_val + 1); + offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val); + + offset_val = __msa_pckev_b(offset_val, offset_val); + offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val); + offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31)); + offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31)); + + if (!((sao_left_class > 12) & (sao_left_class < 29))) { + SWAP(offset0, offset1); + } + + for (h_cnt = height >> 2; h_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r); + + src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r); + mask = __msa_srli_b(src0_r, 3); + offset = __msa_vshf_b(mask, offset1, offset0); + + UNPCK_SB_SH(offset, temp0, temp1); + ILVRL_B2_SH(zero, src0_r, dst0, dst1); + ADD2(dst0, temp0, dst1, temp1, dst0, dst1); + CLIP_SH2_0_255(dst0, dst1); + dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride, + uint8_t *src, int32_t src_stride, + int32_t sao_left_class, + int16_t *sao_offset_val, + int32_t height) +{ + int32_t h_cnt; + v16u8 src0, src1, src2, src3; + v16i8 src0_r, src1_r, mask0, mask1; + v16i8 offset, offset_val; + v16i8 offset0 = { 0 }; + v16i8 offset1 = { 0 }; + v16i8 zero = { 0 }; + v8i16 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3; + + offset_val = LD_SB(sao_offset_val + 1); + offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val); + offset_val = __msa_pckev_b(offset_val, offset_val); + offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val); + offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31)); + offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31)); + + if (!((sao_left_class > 12) & (sao_left_class < 29))) { + SWAP(offset0, offset1); + } + + for (h_cnt = height >> 2; h_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r); + + mask0 = __msa_srli_b(src0_r, 3); + mask1 = 
__msa_srli_b(src1_r, 3); + + offset = __msa_vshf_b(mask0, offset1, offset0); + UNPCK_SB_SH(offset, temp0, temp1); + + offset = __msa_vshf_b(mask1, offset1, offset0); + UNPCK_SB_SH(offset, temp2, temp3); + + UNPCK_UB_SH(src0_r, dst0, dst1); + UNPCK_UB_SH(src1_r, dst2, dst3); + ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3, + dst0, dst1, dst2, dst3); + CLIP_SH4_0_255(dst0, dst1, dst2, dst3); + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst2); + ST8x4_UB(dst0, dst2, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int32_t sao_left_class, + int16_t *sao_offset_val, + int32_t width, int32_t height) +{ + int32_t h_cnt, w_cnt; + v16u8 src0, src1, src2, src3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 out0, out1, out2, out3; + v16i8 mask0, mask1, mask2, mask3; + v16i8 tmp0, tmp1, tmp2, tmp3, offset_val; + v16i8 offset0 = { 0 }; + v16i8 offset1 = { 0 }; + v16i8 zero = { 0 }; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + offset_val = LD_SB(sao_offset_val + 1); + offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val); + offset_val = __msa_pckev_b(offset_val, offset_val); + offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val); + offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31)); + offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31)); + + if (!((sao_left_class > 12) & (sao_left_class < 29))) { + SWAP(offset0, offset1); + } + + for (h_cnt = height >> 2; h_cnt--;) { + for (w_cnt = 0; w_cnt < (width >> 4); w_cnt++) { + LD_UB4(src + w_cnt * 16, src_stride, src0, src1, src2, src3); + + mask0 = __msa_srli_b((v16i8) src0, 3); + mask1 = __msa_srli_b((v16i8) src1, 3); + mask2 = __msa_srli_b((v16i8) src2, 3); + mask3 = __msa_srli_b((v16i8) src3, 3); + + VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, + tmp0, tmp1); + VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, + tmp2, tmp3); + UNPCK_SB_SH(tmp0, temp0, temp1); + UNPCK_SB_SH(tmp1, temp2, temp3); + UNPCK_SB_SH(tmp2, temp4, temp5); + UNPCK_SB_SH(tmp3, temp6, temp7); + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); + ILVRL_B2_SH(zero, src2, dst4, dst5); + ILVRL_B2_SH(zero, src3, dst6, dst7); + ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3, + dst0, dst1, dst2, dst3); + ADD4(dst4, temp4, dst5, temp5, dst6, temp6, dst7, temp7, + dst4, dst5, dst6, dst7); + CLIP_SH4_0_255(dst0, dst1, dst2, dst3); + CLIP_SH4_0_255(dst4, dst5, dst6, dst7); + PCKEV_B4_SB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, + out0, out1, out2, out3); + ST_SB4(out0, out1, out2, out3, dst + w_cnt * 16, dst_stride); + } + + src += src_stride << 2; + dst += dst_stride << 2; + } +} + +static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + int32_t h_cnt; + uint32_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src_minus11; + v16i8 zero = { 0 }; + v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0; + v8i16 offset_mask0, offset_mask1; + v8i16 sao_offset, src00, src01; + + sao_offset = LD_SH(sao_offset_val); + src -= 1; + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src, src_stride, src_minus10, src_minus11); + src 
+= (2 * src_stride); + + SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1); + SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2); + ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, + src_minus10, src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, + src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + + dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0); + dst_val0 = __msa_copy_u_w((v4i32) dst0, 0); + dst_val1 = __msa_copy_u_w((v4i32) dst0, 2); + SW(dst_val0, dst); + dst += dst_stride; + SW(dst_val1, dst); + dst += dst_stride; + } +} + +static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_minus1; + int32_t h_cnt; + uint64_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 dst0, dst1; + v16i8 zero = { 0 }; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src_minus11; + v16i8 src_zero0, src_plus10, src_zero1, src_plus11; + v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + + for (h_cnt = (height >> 1); h_cnt--;) { + src_minus1 = src - 1; + LD_UB2(src_minus1, src_stride, src_minus10, src_minus11); + + SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1); + SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2); + ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, + src_minus10, src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, 
zero, src_zero1, zero, src00, src01); + + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1, + dst0, dst1); + + dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); + SD(dst_val0, dst); + dst += dst_stride; + SD(dst_val1, dst); + dst += dst_stride; + src += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *dst_ptr, *src_minus1; + int32_t h_cnt, v_cnt; + v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 sao_offset; + v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + v16u8 diff_plus13; + v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3; + v16u8 src_minus10, src_minus11, src_minus12, src_minus13; + v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3; + v16i8 src_zero0, src_zero1, src_zero2, src_zero3; + v16i8 src_plus10, src_plus11, src_plus12, src_plus13; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + sao_offset = LD_SB(sao_offset_val); + sao_offset = __msa_pckev_b(sao_offset, sao_offset); + + for (h_cnt = (height >> 2); h_cnt--;) { + src_minus1 = src - 1; + LD_UB4(src_minus1, src_stride, + src_minus10, src_minus11, src_minus12, src_minus13); + + for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) { + src_minus1 += 16; + dst_ptr = dst + (v_cnt << 4); + LD_UB4(src_minus1, src_stride, src10, src11, src12, src13); + + SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0, + src_zero1, 1); + SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2, + src_zero3, 1); + SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10, + src_plus11, 2); + SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12, + src_plus13, 2); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10); + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11); + cmp_minus12 = ((v16u8) src_zero2 == src_minus12); + cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12); + cmp_minus13 = ((v16u8) src_zero3 == src_minus13); + cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13); + + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11); + diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12); + diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12); + diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13); + diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13); + + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1); + cmp_minus12 = (src_minus12 < (v16u8) src_zero2); + cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2); + cmp_minus13 = (src_minus13 < (v16u8) src_zero3); + cmp_plus13 = ((v16u8) src_plus13 < (v16u8) 
src_zero3); + + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11); + diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12); + diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12); + diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13); + diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13); + + offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2, + offset_mask2, offset_mask2, offset_mask2); + offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3, + offset_mask3, offset_mask3, offset_mask3); + + UNPCK_UB_SH(src_zero0, src0, src1); + UNPCK_SB_SH(offset_mask0, temp0, temp1); + UNPCK_UB_SH(src_zero1, src2, src3); + UNPCK_SB_SH(offset_mask1, temp2, temp3); + UNPCK_UB_SH(src_zero2, src4, src5); + UNPCK_SB_SH(offset_mask2, temp4, temp5); + UNPCK_UB_SH(src_zero3, src6, src7); + UNPCK_SB_SH(offset_mask3, temp6, temp7); + ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0, + temp1, temp2, temp3); + ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4, + temp5, temp6, temp7); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + CLIP_SH4_0_255(temp4, temp5, temp6, temp7); + PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3); + + src_minus10 = src10; + ST_UB(dst0, dst_ptr); + src_minus11 = src11; + ST_UB(dst1, dst_ptr + dst_stride); + src_minus12 = src12; + ST_UB(dst2, dst_ptr + (dst_stride << 1)); + src_minus13 = src13; + ST_UB(dst3, dst_ptr + (dst_stride * 3)); + } + + src += (src_stride << 2); + dst += (dst_stride << 2); + } +} + +static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + int32_t h_cnt; + uint32_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 dst0; + v16i8 zero = { 0 }; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src_minus11, src10, src11; + v16i8 src_zero0, src_zero1; + v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + + LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src + src_stride, src_stride, src10, src11); + + src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10); + src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11); + src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11); + src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) 
src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_w((v4i32) dst0, 0); + dst_val1 = __msa_copy_u_w((v4i32) dst0, 2); + SW(dst_val0, dst); + dst += dst_stride; + SW(dst_val1, dst); + + dst += dst_stride; + src += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + int32_t h_cnt; + uint64_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 zero = { 0 }; + v16i8 src_zero0, src_zero1, dst0, dst1; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src_minus11, src10, src11; + v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + + LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src + src_stride, src_stride, src10, src11); + + src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10); + src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11); + src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11); + src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1, + dst0, dst1); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); + SD(dst_val0, dst); + dst += dst_stride; + SD(dst_val1, dst); + dst += dst_stride; + src += (src_stride << 1); + } +} + +static void 
hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t * + sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig = src; + uint8_t *dst_orig = dst; + int32_t h_cnt, v_cnt; + v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + v16u8 diff_plus13; + v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1; + v16u8 src12, dst2, src13, dst3; + v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + sao_offset = LD_SB(sao_offset_val); + sao_offset = __msa_pckev_b(sao_offset, sao_offset); + + for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) { + src = src_orig + (v_cnt << 4); + dst = dst_orig + (v_cnt << 4); + + LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 2); h_cnt--;) { + LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13); + + cmp_minus10 = (src_minus11 == src_minus10); + cmp_plus10 = (src_minus11 == src10); + cmp_minus11 = (src10 == src_minus11); + cmp_plus11 = (src10 == src11); + cmp_minus12 = (src11 == src10); + cmp_plus12 = (src11 == src12); + cmp_minus13 = (src12 == src11); + cmp_plus13 = (src12 == src13); + + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11); + diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12); + diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12); + diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13); + diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13); + + cmp_minus10 = (src_minus10 < src_minus11); + cmp_plus10 = (src10 < src_minus11); + cmp_minus11 = (src_minus11 < src10); + cmp_plus11 = (src11 < src10); + cmp_minus12 = (src10 < src11); + cmp_plus12 = (src12 < src11); + cmp_minus13 = (src11 < src12); + cmp_plus13 = (src13 < src12); + + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11); + diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12); + diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12); + diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13); + diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13); + + offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask0, offset_mask0, offset_mask0, offset_mask0); + offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask1, offset_mask1, offset_mask1, offset_mask1); + offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask2, offset_mask2, offset_mask2, offset_mask2); + offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13; + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask3, offset_mask3, offset_mask3, 
offset_mask3); + + UNPCK_UB_SH(src_minus11, src0, src1); + UNPCK_SB_SH(offset_mask0, temp0, temp1); + UNPCK_UB_SH(src10, src2, src3); + UNPCK_SB_SH(offset_mask1, temp2, temp3); + UNPCK_UB_SH(src11, src4, src5); + UNPCK_SB_SH(offset_mask2, temp4, temp5); + UNPCK_UB_SH(src12, src6, src7); + UNPCK_SB_SH(offset_mask3, temp6, temp7); + ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0, + temp1, temp2, temp3); + ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4, + temp5, temp6, temp7); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + CLIP_SH4_0_255(temp4, temp5, temp6, temp7); + PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3); + + src_minus10 = src12; + src_minus11 = src13; + + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + + src += (src_stride << 2); + dst += (dst_stride << 2); + } + } +} + +static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + int32_t h_cnt; + uint32_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 zero = { 0 }; + v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus11, src10, src11; + v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0; + v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + + src_orig = src - 1; + LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src_orig + src_stride, src_stride, src10, src11); + + SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); + SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2); + + ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10, + src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, + src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + + dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_w((v4i32) dst0, 0); + dst_val1 = __msa_copy_u_w((v4i32) dst0, 2); + SW(dst_val0, dst); + dst += dst_stride; + SW(dst_val1, dst); + + dst += dst_stride; + src_orig += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + 
int32_t h_cnt; + uint64_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 zero = { 0 }; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src10, src_minus11, src11; + v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0, dst1; + v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + src_orig = src - 1; + + LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src_orig + src_stride, src_stride, src10, src11); + + SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); + SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2); + + ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, + src_minus10, src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, + src_zero0, src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1, + dst0, dst1); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); + SD(dst_val0, dst); + dst += dst_stride; + SD(dst_val1, dst); + + dst += dst_stride; + src_orig += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t * + sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig = src; + uint8_t *dst_orig = dst; + int32_t h_cnt, v_cnt; + v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; + v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; + v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; + v16u8 diff_plus13, src_minus14, src_plus13; + v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3; + v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1; + v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3; + v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12; + v16i8 src_zero3, sao_offset; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + sao_offset = LD_SB(sao_offset_val); + sao_offset = __msa_pckev_b(sao_offset, sao_offset); + + for (h_cnt = (height >> 2); h_cnt--;) { + src_orig = src - 1; + dst_orig = 
dst; + LD_UB4(src_orig, src_stride, + src_minus11, src_minus12, src_minus13, src_minus14); + + for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) { + src_minus10 = LD_UB(src_orig - src_stride); + LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13); + src_plus13 = LD_UB(src + 1 + (v_cnt << 4) + (src_stride << 2)); + src_orig += 16; + + SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0, + src_zero1, 1); + SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2, + src_zero3, 1); + SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10, + src_plus11, 2); + + src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10); + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11); + cmp_minus12 = ((v16u8) src_zero2 == src_minus12); + cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12); + cmp_minus13 = ((v16u8) src_zero3 == src_minus13); + cmp_plus13 = ((v16u8) src_zero3 == src_plus13); + + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11); + diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12); + diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12); + diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13); + diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13); + + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1); + cmp_minus12 = (src_minus12 < (v16u8) src_zero2); + cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2); + cmp_minus13 = (src_minus13 < (v16u8) src_zero3); + cmp_plus13 = (src_plus13 < (v16u8) src_zero3); + + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11); + diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12); + diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12); + diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13); + diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13); + + offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10; + offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11; + offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12; + offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13; + + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask0, offset_mask0, offset_mask0, offset_mask0); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask1, offset_mask1, offset_mask1, offset_mask1); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask2, offset_mask2, offset_mask2, offset_mask2); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask3, offset_mask3, offset_mask3, offset_mask3); + + UNPCK_UB_SH(src_zero0, src0, src1); + UNPCK_SB_SH(offset_mask0, temp0, temp1); + UNPCK_UB_SH(src_zero1, src2, src3); + UNPCK_SB_SH(offset_mask1, temp2, temp3); + UNPCK_UB_SH(src_zero2, src4, src5); + UNPCK_SB_SH(offset_mask2, temp4, temp5); + UNPCK_UB_SH(src_zero3, src6, src7); + UNPCK_SB_SH(offset_mask3, temp6, temp7); + ADD4(temp0, src0, temp1, 
src1, temp2, src2, temp3, src3, temp0, + temp1, temp2, temp3); + ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4, + temp5, temp6, temp7); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + CLIP_SH4_0_255(temp4, temp5, temp6, temp7); + PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, + temp7, temp6, dst0, dst1, dst2, dst3); + + src_minus11 = src10; + src_minus12 = src11; + src_minus13 = src12; + src_minus14 = src13; + + ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride); + dst_orig += 16; + } + + src += (src_stride << 2); + dst += (dst_stride << 2); + } +} + +static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + int32_t h_cnt; + uint32_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 zero = { 0 }; + v16i8 src_zero0, src_zero1, dst0; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src10, src_minus11, src11; + v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01; + + sao_offset = LD_SH(sao_offset_val); + src_orig = src - 1; + + LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src_orig + src_stride, src_stride, src10, src11); + + SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); + SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + + ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, + src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, + src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_w((v4i32) dst0, 0); + dst_val1 = __msa_copy_u_w((v4i32) dst0, 2); + + SW(dst_val0, dst); + dst += dst_stride; + SW(dst_val1, dst); + + dst += dst_stride; + src_orig += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t *sao_offset_val, + int32_t height) +{ + uint8_t *src_orig; + int32_t h_cnt; + uint64_t dst_val0, dst_val1; + v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16i8 zero = { 0 }; + v16i8 src_zero0, src_zero1, dst0, dst1; + v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; + v16u8 src_minus10, src10, src_minus11, 
src11; + v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1; + + sao_offset = LD_SH(sao_offset_val); + src_orig = src - 1; + + LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11); + + for (h_cnt = (height >> 1); h_cnt--;) { + LD_UB2(src_orig + src_stride, src_stride, src10, src11); + + SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); + SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, + src_minus11); + ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, + src_zero1); + + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + + offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2); + offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2); + + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0, + offset_mask0, offset_mask0, offset_mask0); + VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1, + offset_mask1, offset_mask1, offset_mask1); + ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01); + ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0, + offset_mask1); + CLIP_SH2_0_255(offset_mask0, offset_mask1); + PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1, + dst0, dst1); + + src_minus10 = src10; + src_minus11 = src11; + + dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); + + SD(dst_val0, dst); + dst += dst_stride; + SD(dst_val1, dst); + dst += dst_stride; + + src_orig += (src_stride << 1); + } +} + +static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + int16_t * + sao_offset_val, + int32_t width, + int32_t height) +{ + uint8_t *src_orig, *dst_orig; + int32_t h_cnt, v_cnt; + v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16u8 const1 = (v16u8) __msa_ldi_b(1); + v16u8 dst0, dst1, dst2, dst3; + v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10; + v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11; + v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12; + v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11; + v16u8 src_plus10, src_plus11, src_plus12, src_plus13; + v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3; + v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + + sao_offset = LD_SB(sao_offset_val); + sao_offset = __msa_pckev_b(sao_offset, sao_offset); + + for (h_cnt = (height >> 2); h_cnt--;) { + src_orig = src - 1; + dst_orig = dst; + + LD_UB4(src_orig, src_stride, + src_minus11, src_plus10, src_plus11, src_plus12); + + for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) { + src_minus10 = LD_UB(src_orig + 2 - src_stride); + LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13); + src_plus13 = LD_UB(src_orig + (src_stride << 2)); + src_orig += 16; + + src_zero0 = __msa_sldi_b((v16i8) 
src10, (v16i8) src_minus11, 1); + cmp_minus10 = ((v16u8) src_zero0 == src_minus10); + cmp_plus10 = ((v16u8) src_zero0 == src_plus10); + + src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1); + src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10, + (v16i8) src_minus11, 2); + cmp_minus11 = ((v16u8) src_zero1 == src_minus11); + cmp_plus11 = ((v16u8) src_zero1 == src_plus11); + + src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1); + src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2); + cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12); + cmp_plus12 = ((v16u8) src_zero2 == src_plus12); + + src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1); + src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2); + cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13); + cmp_plus13 = ((v16u8) src_zero3 == src_plus13); + + diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10); + diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10); + diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11); + diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11); + diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12); + diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12); + diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13); + diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13); + + cmp_minus10 = (src_minus10 < (v16u8) src_zero0); + cmp_plus10 = (src_plus10 < (v16u8) src_zero0); + cmp_minus11 = (src_minus11 < (v16u8) src_zero1); + cmp_plus11 = (src_plus11 < (v16u8) src_zero1); + cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2); + cmp_plus12 = (src_plus12 < (v16u8) src_zero2); + cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3); + cmp_plus13 = (src_plus13 < (v16u8) src_zero3); + + diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10); + diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10); + diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11); + diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11); + diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12); + diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12); + diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13); + diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13); + + offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10; + offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11; + offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12; + offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13; + + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask0, offset_mask0, offset_mask0, offset_mask0); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask1, offset_mask1, offset_mask1, offset_mask1); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask2, offset_mask2, offset_mask2, offset_mask2); + VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, + offset_mask3, offset_mask3, offset_mask3, offset_mask3); + + UNPCK_UB_SH(src_zero0, src0, src1); + UNPCK_SB_SH(offset_mask0, temp0, temp1); + UNPCK_UB_SH(src_zero1, src2, src3); + UNPCK_SB_SH(offset_mask1, temp2, temp3); + UNPCK_UB_SH(src_zero2, src4, src5); + UNPCK_SB_SH(offset_mask2, temp4, temp5); + UNPCK_UB_SH(src_zero3, src6, src7); + UNPCK_SB_SH(offset_mask3, temp6, temp7); + + ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0, + temp1, temp2, temp3); + ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4, + temp5, temp6, temp7); + CLIP_SH4_0_255(temp0, temp1, temp2, temp3); + CLIP_SH4_0_255(temp4, 
temp5, temp6, temp7); + PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3); + + src_minus11 = src10; + src_plus10 = src11; + src_plus11 = src12; + src_plus12 = src13; + + ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride); + dst_orig += 16; + } + + src += (src_stride << 2); + dst += (dst_stride << 2); + } +} + +void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t beta, int32_t *tc, + uint8_t *no_p, uint8_t *no_q) +{ + hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q); +} + +void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t beta, int32_t *tc, + uint8_t *no_p, uint8_t *no_q) +{ + hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q); +} + +void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q); +} + +void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q); +} + +void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, + int width, int height) +{ + if (width >> 4) { + hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src, + sao_left_class, sao_offset_val, + width - (width % 16), height); + dst += width - (width % 16); + src += width - (width % 16); + width %= 16; + } + + if (width >> 3) { + hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src, + sao_left_class, sao_offset_val, height); + dst += 8; + src += 8; + width %= 8; + } + + if (width) { + hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src, + sao_left_class, sao_offset_val, height); + } +} + +void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, + ptrdiff_t stride_dst, + int16_t *sao_offset_val, + int eo, int width, int height) +{ + ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t); + + switch (eo) { + case 0: + if (width >> 4) { + hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, + width - (width % 16), + height); + dst += width - (width % 16); + src += width - (width % 16); + width %= 16; + } + + if (width >> 3) { + hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + dst += 8; + src += 8; + width %= 8; + } + + if (width) { + hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + } + break; + + case 1: + if (width >> 4) { + hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, + width - (width % 16), + height); + dst += width - (width % 16); + src += width - (width % 16); + width %= 16; + } + + if (width >> 3) { + hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + dst += 8; + src += 8; + width %= 8; + } + + if (width) { + hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + } + break; + + case 2: + if (width >> 4) { + hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, + width - (width % 16), + height); + dst += width - (width % 16); + src += width - (width % 16); + width %= 16; + } + + if (width >> 3) { + 
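/* width 8..15 remaining after the 16-wide pass: filter one 8-wide column; + a 4-wide remainder, if any, is handled by the call further below. */ +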
hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + dst += 8; + src += 8; + width %= 8; + } + + if (width) { + hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + } + break; + + case 3: + if (width >> 4) { + hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, + width - (width % 16), + height); + dst += width - (width % 16); + src += width - (width % 16); + width %= 16; + } + + if (width >> 3) { + hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + dst += 8; + src += 8; + width %= 8; + } + + if (width) { + hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst, + src, stride_src, + sao_offset_val, height); + } + break; + } +} diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h new file mode 100644 index 0000000000..b06c5ad9b9 --- /dev/null +++ b/libavcodec/mips/hevc_macros_msa.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H +#define AVCODEC_MIPS_HEVC_MACROS_MSA_H + +#define HEVC_PCK_SW_SB2(in0, in1, out) \ +{ \ + v8i16 tmp0_m; \ + \ + tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1); \ + out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m); \ +} + +#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out) \ +{ \ + v8i16 tmp0_m, tmp1_m; \ + \ + PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m); \ + out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m); \ +} + +#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ +} + +#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, out0, out1, out2) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \ + \ + PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m); \ + PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m); \ +} + +#define HEVC_FILT_8TAP(in0, in1, in2, in3, \ + filt0, filt1, filt2, filt3) \ +( { \ + v4i32 out_m; \ + \ + out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0); \ + out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1); \ + DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m); \ + out_m; \ +} ) + +#define HEVC_FILT_4TAP(in0, in1, filt0, filt1) \ +( { \ + v4i32 out_m; \ + \ + out_m = __msa_dotp_s_w(in0, 
(v8i16) filt0); \ + out_m = __msa_dpadd_s_w(out_m, in1, (v8i16) filt1); \ + out_m; \ +} ) + +#endif /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */ diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c new file mode 100644 index 0000000000..8208be327d --- /dev/null +++ b/libavcodec/mips/hevc_mc_bi_msa.c @@ -0,0 +1,4462 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" +#include "libavcodec/mips/hevc_macros_msa.h" + +#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \ +{ \ + ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ + SRARI_H2_SH(out0, out1, rnd_val); \ + CLIP_SH2_0_255(out0, out1); \ +} + +#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \ + vec0, vec1, vec2, vec3, rnd_val, \ + out0, out1, out2, out3) \ +{ \ + HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \ + HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \ +} + +static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + v16i8 zero = { 0 }; + + if (2 == height) { + v16i8 src0, src1; + v8i16 dst0, in0, in1; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + + src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + + dst0 = (v8i16) __msa_ilvr_b(zero, src0); + dst0 <<= 6; + dst0 += in0; + dst0 = __msa_srari_h(dst0, 7); + dst0 = CLIP_SH_0_255(dst0); + + dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); + ST4x2_UB(dst0, dst, dst_stride); + } else if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 dst0, dst1; + v8i16 in0, in1, in2, in3; + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + + dst0 <<= 6; + dst1 <<= 6; + HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); + + dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); + } else if (0 == height % 8) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + + LD_SH8(src1_ptr, src2_stride, + in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, 
in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + + ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, + src0, src1, src2, src3); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, + dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST4x8_UB(dst0, dst1, dst, dst_stride); + dst += (8 * dst_stride); + } + } +} + +static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + dst4, dst5, dst6, dst7); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + SLLI_4V(dst4, dst5, dst6, dst7, 6); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST6x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + HEVC_BI_RND_CLIP4(in4, in5, in6, in7, + dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7); + + PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5); + ST6x4_UB(dst4, dst5, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + v16i8 zero = { 0 }; + + if (2 == height) { + v16i8 src0, src1; + v8i16 in0, in1; + v8i16 dst0, dst1; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + + dst0 <<= 6; + dst1 <<= 6; + HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); + + dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST8x2_UB(dst0, dst, dst_stride); + } else if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst, dst_stride); + } else if (6 == height) { + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + + LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + dst4 <<= 6; + dst5 <<= 6; + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, 
dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2, dst, dst_stride); + } else if (0 == height % 8) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + uint32_t loop_cnt; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, + dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, + dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v16i8 zero = { 0 }; + + for (loop_cnt = (16 >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + ILVL_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); + dst4 <<= 6; + dst5 <<= 6; + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + ST12x4_UB(dst0, dst1, dst2, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t width) +{ + uint32_t loop_cnt; + uint32_t cnt; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + v16i8 zero = { 0 }; + + for (cnt = (width >> 4); cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 dst0_l, dst1_l, dst2_l, dst3_l; + + LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, 
src3); + src0_ptr_tmp += (4 * src_stride); + LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7); + src1_ptr_tmp += (4 * src2_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0_r, dst1_r, dst2_r, dst3_r); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0_l, dst1_l, dst2_l, dst3_l); + + SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); + SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); + HEVC_BI_RND_CLIP4(in0, in1, in4, in5, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + HEVC_BI_RND_CLIP4(in2, in3, in6, in7, + dst2_r, dst3_r, dst2_l, dst3_l, 7, + dst2_r, dst3_r, dst2_l, dst3_l); + + PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r); + ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } + + src0_ptr += 16; + src1_ptr += 16; + dst += 16; + } +} + +static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, height, 16); +} + +static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, height, 16); + + hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, + dst + 16, dst_stride, height); +} + +static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, height, 32); +} + +static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, height, 48); +} + +static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, height, 64); +} + +static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src0_ptr -= 3; + + /* rearranging filter */ + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, src0, src1, 
src2, src3, + src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST4x8_UB(dst0, dst1, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) 
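+/* A 12-wide block is processed as an 8-wide column plus a 4-wide column. */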
+{ + hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter, height); +} + +static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, 8, src0, src1); + src0_ptr += src_stride; + LD_SB2(src0_ptr, 8, src2, src3); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + src1_ptr += src2_stride; + LD_SH2(src1_ptr, 8, in2, in3); + src1_ptr += src2_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST_SH2(dst0, dst1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + uint64_t dst_val0; + v16i8 src0, src1, tmp0, tmp1; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2; + v8i16 in0, in1, in2; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr = src0_ptr - 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + in2 = LD_SH(src1_ptr + 16); + src1_ptr += src2_stride; + XORI_B2_128_SB(src0, src1); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, 
vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + + HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); + dst2 = __msa_adds_s_h(dst2, in2); + dst2 = __msa_srari_h(dst2, 7); + dst2 = CLIP_SH_0_255(dst2); + + PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1); + dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0); + ST_SB(tmp0, dst); + SD(dst_val0, dst + 16); + dst += dst_stride; + } +} + +static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, tmp0, tmp1; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + src0_ptr += src_stride; + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); + ST_SB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + } +} + +static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v16i8 tmp0, tmp1, tmp2; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + 
filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + XORI_B2_128_SB(src0, src1); + LD_SH2(src1_ptr, 8, in0, in1); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + + HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); + + tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST_SB(tmp0, dst); + + LD_SB2(src0_ptr + 32, 8, src2, src3); + XORI_B2_128_SB(src2, src3); + src0_ptr += src_stride; + + LD_SH2(src1_ptr + 16, 8, in2, in3); + + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3); + + tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2); + ST_SB(tmp1, dst + 16); + + LD_SH2(src1_ptr + 32, 8, in4, in5); + src1_ptr += src2_stride; + + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); + + tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + ST_SB(tmp2, dst + 32); + dst += dst_stride; + } +} + +static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint8_t *src0_ptr_tmp; + uint8_t *dst_tmp; + int16_t *src1_ptr_tmp; + uint32_t loop_cnt; + uint32_t cnt; + v16i8 src0, src1, src2, tmp0, tmp1; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3; + v8i16 filter_vec, const_vec; + + src0_ptr -= 3; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + src0_ptr_tmp = src0_ptr; + dst_tmp = dst; + src1_ptr_tmp = src1_ptr; + + for (cnt = 2; cnt--;) { + LD_SB2(src0_ptr_tmp, 16, src0, src1); + src2 = LD_SB(src0_ptr_tmp + 24); + src0_ptr_tmp += 32; + LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3); + src1_ptr_tmp += 32; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, 
mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, + dst0, dst1, dst2, dst3); + + PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); + ST_SB2(tmp0, tmp1, dst_tmp, 16); + dst_tmp += 32; + } + + src1_ptr += src2_stride; + src0_ptr += src_stride; + dst += dst_stride; + } +} + +static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src11, src12, src13, src14; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src1110_r, src1211_r, src1312_r, src1413_r; + v16i8 src2110, src4332, src6554, src8776, src10998; + v16i8 src12111110, src14131312; + v8i16 dst10, dst32, dst54, dst76; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + + src0_ptr -= (3 * src_stride); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src2110, src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src7, src8, src9, src10, src11, src12, src13, src14); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, + src1110_r, src1211_r, src1312_r, src1413_r); + ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, + src1413_r, src1312_r, + src8776, src10998, src12111110, src14131312); + XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); + + dst10 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10); + dst32 = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32); + dst54 = const_vec; + DPADD_SB4_SH(src6554, src8776, src10998, src12111110, + filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); + dst76 = 
const_vec; + DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, + filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst10, dst32, dst54, dst76, 7, + dst10, dst32, dst54, dst76); + + PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54); + ST4x8_UB(dst10, dst54, dst, dst_stride); + dst += (8 * dst_stride); + + src2110 = src10998; + src4332 = src12111110; + src6554 = src14131312; + src6 = src14; + } +} + +static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + + src0_ptr -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, + dst2_r, dst2_r, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, + dst3_r, dst3_r, dst3_r, dst3_r); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + + src6 = src10; + } +} + +static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 src2110, src4332, src6554, src8776, src10998; + v8i16 dst0_l, dst1_l; + v8i16 filt0, filt1, 
filt2, filt3; + v8i16 filter_vec, const_vec; + + src0_ptr -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, + src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_l, src87_l, src98_l, src109_l); + ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, + dst2_r, dst2_r, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, + dst3_r, dst3_r, dst3_r, dst3_r); + dst0_l = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, + dst0_l, dst0_l, dst0_l, dst0_l); + dst1_l = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, + dst1_l, dst1_l, dst1_l, dst1_l); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l); + + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l); + ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, int32_t width) +{ + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt; + uint32_t cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v8i16 dst0_r, dst1_r; + v16i8 src10_l, src32_l, src54_l, src76_l; + v16i8 src21_l, src43_l, src65_l, src87_l; + v8i16 dst0_l, dst1_l; + 
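/* Bi-prediction: each vertically filtered intermediate is added to the + * co-located int16_t prediction from src1_ptr, rounded by 7 bits and + * clipped to [0, 255] via HEVC_BI_RND_CLIP. */ +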
v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + + src0_ptr -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + LD_SB7(src0_ptr_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6); + src0_ptr_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr_tmp, src_stride, src7, src8); + src0_ptr_tmp += (2 * src_stride); + LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); + LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3); + src1_ptr_tmp += (2 * src2_stride); + XORI_B2_128_SB(src7, src8); + + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst0_l = const_vec; + DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3, + dst0_l, dst0_l, dst0_l, dst0_l); + dst1_l = const_vec; + DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3, + dst1_l, dst1_l, dst1_l, dst1_l); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src10_r = src32_r; + src32_r = src54_r; + src54_r = src76_r; + src21_r = src43_r; + src43_r = src65_r; + src65_r = src87_r; + src10_l = src32_l; + src32_l = src54_l; + src54_l = src76_l; + src21_l = src43_l; + src43_l = src65_l; + src65_l = src87_l; + src6 = src8; + } + + src0_ptr += 16; + src1_ptr += 16; + dst += 16; + } +} + +static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, 16); +} + +static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, 16); + hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, + dst + 16, dst_stride, filter, height); +} + +static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, 32); +} + +static void hevc_vt_bi_8t_48w_msa(uint8_t 
*src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, 48); +} + +static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, 64); +} + +static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst30, dst41, dst52, dst63, dst66, dst87; + v4i32 dst0_r, dst1_r, in0_r, in0_l; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + src0_ptr -= ((3 * src_stride) + 3); + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + + dst30 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst30, dst30, dst30, dst30); + dst41 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst41, dst41, dst41, dst41); + dst52 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst52, dst52, dst52, dst52); + dst63 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst63, dst63, dst63, dst63); + + ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52, + dst10_r, dst21_r, dst32_r); + dst43_r = __msa_ilvl_h(dst41, dst30); + dst54_r = __msa_ilvl_h(dst52, dst41); + dst65_r = __msa_ilvl_h(dst63, dst52); + dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src7, src8); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + src1_ptr += (2 * src2_stride); + + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src8, mask0, 
mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst87 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst87, dst87, dst87, dst87); + dst76_r = __msa_ilvr_h(dst87, dst66); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst0_r >>= 6; + dst1_r >>= 6; + UNPCK_SH_SW(in0, in0_r, in0_l); + dst0_r = __msa_adds_s_w(dst0_r, in0_r); + dst1_r = __msa_adds_s_w(dst1_r, in0_l); + SRARI_W2_SW(dst0_r, dst1_r, 7); + dst0_r = CLIP_SW_0_255(dst0_r); + dst1_r = CLIP_SW_0_255(dst1_r); + + HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + dst += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); + } +} + +static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, int32_t width) +{ + uint32_t loop_cnt; + uint32_t cnt; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1; + v4i32 in0_r, in0_l, in1_r, in1_l; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 dst21_l, dst43_l, dst65_l, dst87_l; + + src0_ptr -= ((3 * src_stride) + 3); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (cnt = width >> 3; cnt--;) { + src0_ptr_tmp = src0_ptr; + dst_tmp = dst; + src1_ptr_tmp = src1_ptr; + + LD_SB7(src0_ptr_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6); + src0_ptr_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + dst1 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + dst2 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); 
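+        /* rows 0..6 are horizontally filtered into 16-bit intermediates (dst0..dst6) that seed the vertical 8-tap pass below */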
+ dst3 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + dst5 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + dst6 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + + ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_r, dst32_r, dst54_r, dst21_r); + ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); + ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_l, dst32_l, dst54_l, dst21_l); + ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); + + for (loop_cnt = height >> 1; loop_cnt--;) { + /* row 7 */ + LD_SB2(src0_ptr_tmp, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + src0_ptr_tmp += 2 * src_stride; + + LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); + src1_ptr_tmp += (2 * src2_stride); + + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst8 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst8, dst8, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_r >>= 6; + dst1_l >>= 6; + + UNPCK_SH_SW(in0, in0_r, in0_l); + UNPCK_SH_SW(in1, in1_r, in1_l); + in0_r = __msa_adds_s_w(in0_r, dst0_r); + in0_l = __msa_adds_s_w(in0_l, dst0_l); + in1_r = __msa_adds_s_w(in1_r, dst1_r); + in1_l = __msa_adds_s_w(in1_l, dst1_l); + SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7); + in0_r = CLIP_SW_0_255(in0_r); + in0_l = CLIP_SW_0_255(in0_l); + in1_r = CLIP_SW_0_255(in1_r); + in1_l = CLIP_SW_0_255(in1_l); + + HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst10_l = dst32_l; + dst32_l = dst54_l; + dst54_l = dst76_l; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst21_l = dst43_l; + dst43_l = dst65_l; + dst65_l = dst87_l; + dst6 = dst8; + } + + src0_ptr += 8; + dst += 8; + src1_ptr += 8; + } +} + +static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 8); +} + +static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t 
*src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 8); + + hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter_x, filter_y, height); +} + +static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 16); +} + +static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 24); +} + +static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 32); +} + +static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 48); +} + +static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 64); +} + +static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, dst0, vec0, vec1; + v8i16 in0, in1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1; + v8i16 tmp0; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src0, src1); + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + tmp0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); + + tmp0 = __msa_adds_s_h(tmp0, in0); + tmp0 = __msa_srari_h(tmp0, 7); + tmp0 = CLIP_SH_0_255(tmp0); + dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); + + ST4x2_UB(dst0, dst, dst_stride); +} + +static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + 
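+    /* horizontal 4-tap filter for one 4x4 block: rows are filtered in pairs, added to the 16-bit samples from src1_ptr, rounded by 7 and clipped to 8 bits */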
v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, dst0, vec0, vec1; + v8i16 in0, in1, in2, in3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1; + v8i16 tmp0, tmp1; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + tmp0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + tmp1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1); + HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1); + dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); + + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 dst0, dst1; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1, vec0, vec1; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + tmp0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + tmp1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1); + VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); + tmp2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2); + VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); + tmp3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); + ST4x8_UB(dst0, dst1, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else if (4 == height) { + hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else if (8 == height || 16 == height) 
{ + hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } +} + +static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST6x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v8i16 filt0, filt1; + v16i8 src0, src1; + v8i16 in0, in1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); + + dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST8x2_UB(dst0, dst, dst_stride); +} + +static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + 
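+    /* SPLATI_H2_SH broadcasts the two adjacent-pair taps of the 4-tap filter; DPADD_SB2_SH consumes each pair as packed signed bytes */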
SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + LD_SH2(src1_ptr, src2_stride, in4, in5); + XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2, dst, dst_stride); +} + +static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, 
height); + } else if (6 == height) { + hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else if (0 == (height % 4)) { + hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } +} + +static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask2 = { + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 + }; + v16i8 mask1, mask3; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask3 = mask2 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); + ST12x4_UB(dst0, dst1, dst2, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); + LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7); + src0_ptr += (4 * src_stride); 
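+        /* in0..in7: left and right 8-sample halves of four 16-wide rows of the other prediction's 16-bit intermediates */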
+ LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); + LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); + src1_ptr += (4 * src2_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP4(in4, in5, in6, in7, + dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7); + + PCKEV_B4_SH(dst1, dst0, dst3, dst2, + dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3); + ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + dst_tmp = dst + 16; + src1_ptr_tmp = src1_ptr + 16; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); + LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); + LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); + src1_ptr += (4 * src2_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src4, src4, src4, src4, 
mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + HEVC_BI_RND_CLIP4(in4, in5, in6, in7, + dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7); + + PCKEV_B4_SH(dst1, dst0, dst3, dst2, + dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3); + ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); + src1_ptr_tmp += (4 * src2_stride); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST8x4_UB(dst0, dst1, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } +} + +static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src0_ptr -= 1; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + src0_ptr += src_stride; + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST_SH2(dst0, dst1, dst, 16); + dst += dst_stride; + + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + src0_ptr += src_stride; 
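+        /* 32 samples of the other prediction's 16-bit intermediates for the second row of the 2-row unrolled loop */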
+ LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); + + PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); + ST_SH2(dst0, dst1, dst, 16); + dst += dst_stride; + } +} + +static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v8i16 dst10; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst10 = __msa_adds_s_h(dst10, in0); + dst10 = __msa_srari_h(dst10, 7); + dst10 = CLIP_SH_0_255(dst10); + + dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10); + ST4x2_UB(dst10, dst, dst_stride); +} + +static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, src6554; + v8i16 dst10, dst32; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554); + XORI_B2_128_SB(src4332, src6554); + + dst10 
= const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32); + + dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10); + ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src6, src7, src8, src9; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src2110, src4332, src6554, src8776; + v8i16 dst10, dst32, dst54, dst76; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); + src0_ptr += (6 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, + src4332, src6554, src8776); + XORI_B3_128_SB(src4332, src6554, src8776); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + dst54 = const_vec; + DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); + + LD_SB2(src0_ptr, src_stride, src9, src2); + src0_ptr += (2 * src_stride); + ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + dst76 = const_vec; + DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst10, dst32, dst54, dst76, 7, + dst10, dst32, dst54, dst76); + + PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54); + ST4x8_UB(dst10, dst54, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else if (4 == height) { + hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else { + hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } +} + +static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t 
src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, dst0_r, dst1_r; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r); + dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); + + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + 
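+    /* source bytes are flipped to signed range (XORI 128) before filtering; the taps sum to 64, so the 128 << 6 offset restores the unsigned bias */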
filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); + dst4_r = const_vec; + DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r); + dst5_r = const_vec; + DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r); + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t 
*filter, + int32_t height) +{ + if (2 == height) { + hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else if (6 == height) { + hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } else { + hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height); + } +} + +static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; + v16i8 src2110, src4332; + v8i16 dst0_l, dst1_l, filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= (1 * src_stride); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + XORI_B2_128_SB(src3, src4); + + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst0_l = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l); + + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); + + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); + dst1_l = const_vec; + DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst2_r, dst3_r, 7, + dst0_r, dst1_r, dst2_r, dst3_r); + HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l); + + PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l); + ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src10_l, src32_l, 
src21_l, src43_l; + v8i16 dst0_r, dst1_r, dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10, src11; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + /* 16width */ + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + /* 8width */ + LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); 
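+    /* prime the 4-tap vertical filter for the extra 8 columns: rows 0-2 at column offset 16, interleaved pairwise */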
+ ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + /* 16width */ + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + LD_SH2((src1_ptr + 16), src2_stride, in4, in5); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + /* 8width */ + LD_SB2(src0_ptr + 16, src_stride, src9, src10); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + /* 16width */ + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + /* 8width */ + dst2_r = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); + /* 16width */ + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r); + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst2_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + + /* 16width */ + LD_SB2(src0_ptr, src_stride, src5, src2); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + LD_SH2((src1_ptr + 16), src2_stride, in4, in5); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + /* 8width */ + LD_SB2(src0_ptr + 16, src_stride, src11, src8); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + /* 16width */ + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); + /* 8width */ + dst2_r = const_vec; + DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r); + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r); + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst2_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + uint8_t *dst_tmp = dst + 16; + v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, in3, 
in4, in5, in6, in7; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src87_l, src109_l; + v8i16 dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src0_ptr -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + /* 16width */ + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + /* next 16width */ + LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + /* 16width */ + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + LD_SH2((src1_ptr + 16), src2_stride, in4, in5); + LD_SH2((src1_ptr + 24), src2_stride, in6, in7); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + /* 16width */ + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + /* 16width */ + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + dst0_r, dst1_r, dst0_l, dst1_l, 7, + dst0_r, dst1_r, dst0_l, dst1_l); + + src10_r = src32_r; + src21_r = src43_r; + src10_l = src32_l; + src21_l = src43_l; + src2 = src4; + + PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + /* next 16width */ + LD_SB2(src0_ptr + 16, src_stride, src9, src10); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); + /* next 16width */ + dst2_r = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); + dst2_l = const_vec; + DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l); + dst3_r = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); + dst3_l = const_vec; + DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l); + /* next 16width */ + HEVC_BI_RND_CLIP4(in4, in5, in6, in7, + dst2_r, dst3_r, dst2_l, dst3_l, 7, + dst2_r, dst3_r, dst2_l, dst3_l); + + PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r); + ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src76_r = src98_r; + src87_r = src109_r; + src76_l = src98_l; + src87_l = src109_l; + src8 = src10; + } +} + +static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v8i16 in0, in1; + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 
5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst1_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src3, src4); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); + dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0); + dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7); + dst0_r = (v4i32) CLIP_SH_0_255(dst0_r); + + dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v8i16 in0, in1, in2, in3; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 dst0_r, dst1_r; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, 
src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + XORI_B4_128_SB(src3, src4, src5, src6); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + tmp0 >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + tmp1 >>= 6; + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst10_r = __msa_ilvr_h(dst5, dst4); + tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + tmp2 >>= 6; + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst5); + tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + tmp3 >>= 6; + PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r); + HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r); + + dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, 
dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + for (loop_cnt = height >> 3; loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src3, src4, src5, src6, src7, src8, src9, src10); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst54_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_r >>= 6; + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + dst65_r = __msa_ilvr_h(dst6, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_r >>= 6; + /* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + dst76_r = __msa_ilvr_h(dst7, dst6); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_r >>= 6; + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + dst87_r = __msa_ilvr_h(dst8, dst7); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_r >>= 6; + /* row 9 */ + VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); + dst9 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9); + dst10_r = __msa_ilvr_h(dst9, dst8); + dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1); + dst6_r >>= 6; + /* row 10 */ + VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst9); + dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1); + dst7_r >>= 6; + PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r, + dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + + PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + ST4x8_UB(tmp0, tmp1, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + if (2 == height) { + hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, height); + } else if (4 == height) { + hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, 
height); + } else if (0 == (height % 8)) { + hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height); + } +} + +static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, 
dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + + PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v8i16 tmp0, tmp1; + v8i16 in0, in1; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src3, src4); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); + HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1); + + dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 
5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v4i32 dst4_r, dst4_l, dst5_r, dst5_l; + v8i16 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 dst21_r, dst43_r, dst21_l, dst43_l; + v8i16 dst54_r, dst54_l, dst65_r, dst65_l; + v8i16 dst76_r, dst76_l, dst87_r, dst87_l; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r); + + LD_SB2(src0_ptr, src_stride, src5, src6); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src6); + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r); + + LD_SB2(src0_ptr, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + /* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, 
mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); + + dst4_r >>= 6; + dst4_l >>= 6; + tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r); + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); + dst5_r >>= 6; + dst5_l >>= 6; + tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r); + + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5); + + PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r); + dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (cnt = width >> 3; cnt--;) { + src0_ptr_tmp = src0_ptr; + dst_tmp = dst; + src1_ptr_tmp = src1_ptr; + + LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); + src0_ptr_tmp += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); + src0_ptr_tmp += (4 * src_stride); + LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); + src1_ptr_tmp += (4 * src2_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, 
vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BI_RND_CLIP4(in0, in1, in2, in3, + tmp0, tmp1, tmp2, tmp3, 7, + tmp0, tmp1, tmp2, tmp3); + + PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } + + src0_ptr += 8; + dst += 8; + src1_ptr += 8; + } +} + +static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + if (2 == height) { + hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, height); + } else if (6 == height) { + hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, height); + } else { + hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height, 8); + } +} + +static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 8); + hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter_x, filter_y, height); +} + +static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 16); +} + +static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + 
height, 24); +} + +static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, 32); +} + +#define BI_MC_COPY(WIDTH) \ +void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int16_t *src_16bit, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ + dst, dst_stride, height); \ +} + +BI_MC_COPY(4); +BI_MC_COPY(6); +BI_MC_COPY(8); +BI_MC_COPY(12); +BI_MC_COPY(16); +BI_MC_COPY(24); +BI_MC_COPY(32); +BI_MC_COPY(48); +BI_MC_COPY(64); + +#undef BI_MC_COPY + +#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ +void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int16_t *src_16bit, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ + \ + hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ + MAX_PB_SIZE, dst, dst_stride, \ + filter, height); \ +} + +BI_MC(qpel, h, 4, 8, hz, mx); +BI_MC(qpel, h, 8, 8, hz, mx); +BI_MC(qpel, h, 12, 8, hz, mx); +BI_MC(qpel, h, 16, 8, hz, mx); +BI_MC(qpel, h, 24, 8, hz, mx); +BI_MC(qpel, h, 32, 8, hz, mx); +BI_MC(qpel, h, 48, 8, hz, mx); +BI_MC(qpel, h, 64, 8, hz, mx); + +BI_MC(qpel, v, 4, 8, vt, my); +BI_MC(qpel, v, 8, 8, vt, my); +BI_MC(qpel, v, 12, 8, vt, my); +BI_MC(qpel, v, 16, 8, vt, my); +BI_MC(qpel, v, 24, 8, vt, my); +BI_MC(qpel, v, 32, 8, vt, my); +BI_MC(qpel, v, 48, 8, vt, my); +BI_MC(qpel, v, 64, 8, vt, my); + +BI_MC(epel, h, 4, 4, hz, mx); +BI_MC(epel, h, 8, 4, hz, mx); +BI_MC(epel, h, 6, 4, hz, mx); +BI_MC(epel, h, 12, 4, hz, mx); +BI_MC(epel, h, 16, 4, hz, mx); +BI_MC(epel, h, 24, 4, hz, mx); +BI_MC(epel, h, 32, 4, hz, mx); + +BI_MC(epel, v, 4, 4, vt, my); +BI_MC(epel, v, 8, 4, vt, my); +BI_MC(epel, v, 6, 4, vt, my); +BI_MC(epel, v, 12, 4, vt, my); +BI_MC(epel, v, 16, 4, vt, my); +BI_MC(epel, v, 24, 4, vt, my); +BI_MC(epel, v, 32, 4, vt, my); + +#undef BI_MC + +#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \ +void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int16_t *src_16bit, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ + const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ + \ + hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ + MAX_PB_SIZE, dst, dst_stride, \ + filter_x, filter_y, \ + height); \ +} + +BI_MC_HV(qpel, hv, 4, 8, hv); +BI_MC_HV(qpel, hv, 8, 8, hv); +BI_MC_HV(qpel, hv, 12, 8, hv); +BI_MC_HV(qpel, hv, 16, 8, hv); +BI_MC_HV(qpel, hv, 24, 8, hv); +BI_MC_HV(qpel, hv, 32, 8, hv); +BI_MC_HV(qpel, hv, 48, 8, hv); +BI_MC_HV(qpel, hv, 64, 8, hv); + +BI_MC_HV(epel, hv, 4, 4, hv); +BI_MC_HV(epel, hv, 8, 4, hv); +BI_MC_HV(epel, hv, 6, 4, hv); +BI_MC_HV(epel, hv, 12, 4, hv); +BI_MC_HV(epel, hv, 16, 4, hv); +BI_MC_HV(epel, hv, 24, 4, hv); +BI_MC_HV(epel, hv, 32, 4, hv); + +#undef BI_MC_HV diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c new file mode
100644 index 0000000000..05a28ece44 --- /dev/null +++ b/libavcodec/mips/hevc_mc_biw_msa.c @@ -0,0 +1,5572 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" +#include "libavcodec/mips/hevc_macros_msa.h" + +#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \ + out0_r, out1_r, out0_l, out1_l) \ +{ \ + ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \ + ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \ + \ + out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \ + out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \ + out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \ + out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ + \ + SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ + \ + out0_r = CLIP_SW_0_255(out0_r); \ + out1_r = CLIP_SW_0_255(out1_r); \ + out0_l = CLIP_SW_0_255(out0_l); \ + out1_l = CLIP_SW_0_255(out1_l); \ +} + +#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \ + wgt, rnd, offset, \ + out0_r, out1_r, out2_r, out3_r, \ + out0_l, out1_l, out2_l, out3_l) \ +{ \ + HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \ + out0_r, out1_r, out0_l, out1_l) \ + HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, \ + out2_r, out3_r, out2_l, out3_l) \ +} + +#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \ +{ \ + ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ + SRARI_H2_SH(out0, out1, rnd_val); \ + CLIP_SH2_0_255(out0, out1); \ +} + +#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \ + vec0, vec1, vec2, vec3, rnd_val, \ + out0, out1, out2, out3) \ +{ \ + HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \ + HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \ +} + +static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v16i8 zero = { 0 }; + v4i32 weight_vec, offset_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + if (2 == height) { + v16i8 src0, src1; + v8i16 in0, in1, dst0; + v4i32 dst0_r, dst0_l; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); + + dst0 = (v8i16) 
__msa_ilvr_b(zero, src0); + dst0 <<= 6; + + ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); + dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, + (v8i16) weight_vec); + dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, + (v8i16) weight_vec); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + } else if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + dst0 <<= 6; + dst1 <<= 6; + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + } else if (0 == height % 8) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, + in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, + src0, src1, src2, src3); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } + } +} + +static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, 
dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v16i8 zero = { 0 }; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + if (2 == height) { + v16i8 src0, src1; + v8i16 in0, in1, dst0, dst1; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + + dst0 <<= 6; + dst1 <<= 6; + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); + } else if (6 == height) { + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + dst4 <<= 6; + dst5 <<= 6; + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); + } else if (0 == height % 4) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + 
int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = (16 >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + ILVL_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); + + dst4 <<= 6; + dst5 <<= 6; + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + int32_t offset, weight; + v16i8 zero = { 0 }; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (cnt = (width >> 4); cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l; + + LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3); + src0_ptr_tmp += (4 * src_stride); + LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7); + src1_ptr_tmp += (4 * src2_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + tmp0, tmp1, tmp2, tmp3); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + tmp4, tmp5, tmp6, tmp7); + + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + in0, in1, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, 
dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, + in2, in3, in6, in7, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } + + src0_ptr += 16; + src1_ptr += 16; + dst += 16; + } +} + +static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val, 16); +} + +static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val, 16); + hevc_biwgt_copy_8w_msa(src0_ptr + 16, src_stride, + src1_ptr + 16, src2_stride, + dst + 16, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val); +} + +static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val, 32); +} + +static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val, 48); +} + +static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, height, weight0, + weight1, offset0, offset1, rnd_val, 64); +} + +static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1; + v8i16 in0, in1, in2, in3; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + 
v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src0_ptr -= 3; + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + 
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hz_biwgt_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + hevc_hz_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); +} + +static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr -= 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, 8, src0, src1); + src0_ptr += src_stride; + LD_SB2(src0_ptr, 8, src2, src3); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + src1_ptr += src2_stride; + LD_SH2(src1_ptr, 8, in2, in3); + src1_ptr += src2_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, 
filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + uint64_t dst_val0; + int32_t offset, weight; + v16i8 src0, src1; + v8i16 in0, in1, in2; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2; + v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src0_ptr = src0_ptr - 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + in2 = LD_SH(src1_ptr + 16); + src1_ptr += src2_stride; + XORI_B2_128_SB(src0, src1); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); + dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, + (v8i16) weight_vec); + dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, + (v8i16) weight_vec); + SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); + dst2_r = CLIP_SW_0_255(dst2_r); + dst2_l = CLIP_SW_0_255(dst2_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r); + dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0); + ST_SW(dst0_r, dst); + SD(dst_val0, dst + 16); + dst += dst_stride; + } +} + +static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2; + 
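+    /* per row: two 16-byte loads cover columns 0..31, and the extra load at offset 24 supplies the last 8 outputs together with their right-hand taps */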
v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + src0_ptr += src_stride; + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + dst += dst_stride; + } +} + +static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + uint64_t dst_val0; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, 
filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB3(src0_ptr, 16, src0, src1, src2); + src3 = LD_SB(src0_ptr + 40); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + in2 = LD_SH(src1_ptr + 16); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); + dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, + (v8i16) weight_vec); + dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, + (v8i16) weight_vec); + SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); + dst2_r = CLIP_SW_0_255(dst2_r); + dst2_l = CLIP_SW_0_255(dst2_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r); + dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0); + ST_SW(dst0_r, dst); + SD(dst_val0, dst + 16); + + LD_SH2(src1_ptr + 24, 8, in3, in4); + in5 = LD_SH(src1_ptr + 40); + src1_ptr += src2_stride; + + HEVC_BIW_RND_CLIP2(dst3, dst4, in3, in4, + weight_vec, rnd_vec, offset_vec, + dst3_r, dst4_r, dst3_l, dst4_l); + + ILVRL_H2_SW(dst5, in5, dst5_r, dst5_l); + dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r, + (v8i16) weight_vec); + dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l, + (v8i16) weight_vec); + SRAR_W2_SW(dst5_r, dst5_l, rnd_vec); + dst5_r = CLIP_SW_0_255(dst5_r); + dst5_l = CLIP_SW_0_255(dst5_l); + + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); + HEVC_PCK_SW_SB2(dst3_l, dst3_r, dst3_r); + dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0); + SD(dst_val0, dst + 24); + ST_SW(dst4_r, dst + 32); + dst += dst_stride; + } +} + +static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint8_t *src0_ptr_tmp; + uint8_t *dst_tmp; + int16_t *src1_ptr_tmp; + uint32_t loop_cnt, cnt; + int32_t offset, weight; + v16i8 src0, src1, src2; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, 
mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 3; + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + src0_ptr_tmp = src0_ptr; + dst_tmp = dst; + src1_ptr_tmp = src1_ptr; + + for (cnt = 2; cnt--;) { + LD_SB2(src0_ptr_tmp, 16, src0, src1); + src2 = LD_SB(src0_ptr_tmp + 24); + src0_ptr_tmp += 32; + LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3); + src1_ptr_tmp += 32; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, 16); + dst_tmp += 32; + } + + src0_ptr += src_stride; + src1_ptr += src2_stride; + dst += dst_stride; + + } +} + +static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src11, src12, src13, src14; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src1110_r, src1211_r, src1312_r, src1413_r; + v16i8 src2110, src4332, src6554, src8776, src10998; + v16i8 src12111110, src14131312; + v8i16 dst10, dst32, dst54, dst76; + v4i32 dst10_r, dst32_r, dst54_r, dst76_r; + v4i32 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (3 * src_stride); + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = 
__msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src2110, src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src7, src8, src9, src10, src11, src12, src13, src14); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, + src1110_r, src1211_r, src1312_r, src1413_r); + ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, + src1413_r, src1312_r, + src8776, src10998, src12111110, src14131312); + XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); + + dst10 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1, + filt2, filt3, dst10, dst10, dst10, dst10); + dst32 = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32); + dst54 = const_vec; + DPADD_SB4_SH(src6554, src8776, src10998, src12111110, + filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); + dst76 = const_vec; + DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, + filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); + + HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst10_r, dst32_r, dst54_r, dst76_r, + dst10_l, dst32_l, dst54_l, dst76_l); + + HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r, + dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r); + ST4x8_UB(dst10_r, dst54_r, dst, dst_stride); + dst += (8 * dst_stride); + + src2110 = src10998; + src4332 = src12111110; + src6554 = src14131312; + src6 = src14; + } +} + +static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (3 * src_stride); + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + 
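+    /* filt0..filt3 each hold one adjacent pair of the 8 signed taps, matching the byte-interleaved rows fed to DPADD_SB4_SH */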
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3); + + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v8i16 tmp0, tmp1, tmp2; + v16i8 src10_l, src32_l, src54_l, src76_l; + v16i8 src21_l, src43_l, src65_l, src87_l; + v16i8 src2110, src4332, src6554, src8776; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (3 * src_stride); + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + ILVR_D3_SB(src21_l, src10_l, 
src43_l, src32_l, src65_l, src54_l, + src2110, src4332, src6554); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src7, src8); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2((src1_ptr + 8), src2_stride, in2, in3); + src1_ptr += (2 * src2_stride); + in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2); + XORI_B2_128_SB(src7, src8); + + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + + HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l); + dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, + (v8i16) weight_vec); + dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, + (v8i16) weight_vec); + SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); + dst2_r = CLIP_SW_0_255(dst2_r); + dst2_l = CLIP_SW_0_255(dst2_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r); + ST8x2_UB(dst0_r, dst, dst_stride); + ST4x2_UB(dst2_r, dst + 8, dst_stride); + dst += (2 * dst_stride); + + src10_r = src32_r; + src32_r = src54_r; + src54_r = src76_r; + src21_r = src43_r; + src43_r = src65_r; + src65_r = src87_r; + src2110 = src4332; + src4332 = src6554; + src6554 = src8776; + src6 = src8; + } +} + +static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val, + int32_t width) +{ + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v16i8 src10_l, src32_l, src54_l, src76_l; + v16i8 src21_l, src43_l, src65_l, src87_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (3 * src_stride); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + LD_SB7(src0_ptr_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6); + src0_ptr_tmp += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + 
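+        /* interleave consecutive rows byte-wise so each tap pair is applied with a single byte dot product */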
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr_tmp, src_stride, src7, src8); + src0_ptr_tmp += (2 * src_stride); + LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); + LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3); + src1_ptr_tmp += (2 * src2_stride); + + XORI_B2_128_SB(src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3); + + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src10_r = src32_r; + src32_r = src54_r; + src54_r = src76_r; + src21_r = src43_r; + src43_r = src65_r; + src65_r = src87_r; + src10_l = src32_l; + src32_l = src54_l; + src54_l = src76_l; + src21_l = src43_l; + src43_l = src65_l; + src65_l = src87_l; + src6 = src8; + } + + src0_ptr += 16; + src1_ptr += 16; + dst += 16; + } +} + +static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val, 16); +} + +static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val, 16); + hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride, + src1_ptr + 16, src2_stride, + dst + 16, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); +} + +static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val, 32); +} + +static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + 
uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val, 48); +} + +static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val, 64); +} + +static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst30, dst41, dst52, dst63, dst66, dst87; + v4i32 dst0_r, dst1_r; + v4i32 tmp1, tmp2; + v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + src0_ptr -= ((3 * src_stride) + 3); + + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec0 = __msa_fill_w(weight0); + weight_vec1 = __msa_fill_w(weight1); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); + src0_ptr += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + + dst30 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst30, dst30, dst30, dst30); + dst41 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst41, dst41, dst41, dst41); + dst52 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst52, dst52, dst52, dst52); + dst63 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst63, 
dst63, dst63, dst63); + + ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52, + dst10_r, dst21_r, dst32_r); + dst43_r = __msa_ilvl_h(dst41, dst30); + dst54_r = __msa_ilvl_h(dst52, dst41); + dst65_r = __msa_ilvl_h(dst63, dst52); + dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src7, src8); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + src1_ptr += (2 * src2_stride); + + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst87 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst87, dst87, dst87, dst87); + dst76_r = __msa_ilvr_h(dst87, dst66); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst0_r >>= 6; + dst1_r >>= 6; + + ILVRL_H2_SW(in0, in0, tmp1, tmp2); + tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1, (v8i16) weight_vec0); + tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, (v8i16) weight_vec0); + tmp1 += dst0_r * weight_vec1; + tmp2 += dst1_r * weight_vec1; + SRAR_W2_SW(tmp1, tmp2, rnd_vec); + tmp1 = CLIP_SW_0_255(tmp1); + tmp2 = CLIP_SW_0_255(tmp2); + + HEVC_PCK_SW_SB2(tmp2, tmp1, tmp1); + ST4x2_UB(tmp1, dst, dst_stride); + dst += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); + } +} + +static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt, cnt; + int32_t offset; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 dst21_l, dst43_l, dst65_l, dst87_l; + v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec; + + src0_ptr -= ((3 * src_stride) + 3); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec0 = __msa_fill_w(weight0); + weight_vec1 = __msa_fill_w(weight1); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + 
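+    /* clti_s_b/ilvr_b sign-extend the 8-bit vertical taps to 16 bits; SPLATI_W4 below splats them as coefficient pairs for the vertical pass */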
SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (cnt = width >> 3; cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + LD_SB7(src0_ptr_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6); + src0_ptr_tmp += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + dst1 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + dst2 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + dst3 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + /* row 4 row 5 row 6 */ + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + dst5 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + dst6 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + + ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_r, dst32_r, dst54_r, dst21_r); + ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); + ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_l, dst32_l, dst54_l, dst21_l); + ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src0_ptr_tmp, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + src0_ptr_tmp += 2 * src_stride; + + LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); + src1_ptr_tmp += (2 * src2_stride); + + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 8 */ + VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + + dst8 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst8, dst8, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst1_r >>= 6; + dst1_l >>= 6; + + ILVRL_H2_SW(in0, in0, tmp0, tmp1); + ILVRL_H2_SW(in1, in1, tmp2, tmp3); + tmp0 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp0, + (v8i16) weight_vec0); + tmp1 = 
__msa_dpadd_s_w(offset_vec, (v8i16) tmp1, + (v8i16) weight_vec0); + tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, + (v8i16) weight_vec0); + tmp3 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp3, + (v8i16) weight_vec0); + + tmp0 += (dst0_r * weight_vec1); + tmp1 += (dst0_l * weight_vec1); + tmp2 += (dst1_r * weight_vec1); + tmp3 += (dst1_l * weight_vec1); + + SRAR_W4_SW(tmp0, tmp1, tmp2, tmp3, rnd_vec); + tmp0 = CLIP_SW_0_255(tmp0); + tmp1 = CLIP_SW_0_255(tmp1); + tmp2 = CLIP_SW_0_255(tmp2); + tmp3 = CLIP_SW_0_255(tmp3); + HEVC_PCK_SW_SB4(tmp1, tmp0, tmp3, tmp2, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst10_l = dst32_l; + dst32_l = dst54_l; + dst54_l = dst76_l; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst21_l = dst43_l; + dst43_l = dst65_l; + dst65_l = dst87_l; + dst6 = dst8; + } + + src0_ptr += 8; + src1_ptr += 8; + dst += 8; + } +} + +static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 8); +} + +static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 8); + hevc_hv_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, + src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, offset1, + rnd_val); +} + +static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 16); +} + +static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 24); +} + +static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + 
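+    /* 32-wide blocks are handled as four 8-pixel columns by the generic helper */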
hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 32); +} + +static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 48); +} + +static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val, 64); +} + +static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1; + v8i16 in0, in1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1, vec0, vec1; + v8i16 dst0; + v4i32 dst0_r, dst0_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); + dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); + dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1; + v8i16 dst0, dst1; + v16i8 vec0, vec1; + v8i16 in0, in1, in2, in3; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + v8i16 filter_vec, const_vec; + v4i32 
weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + /* rearranging filter */ + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t weight, offset; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src0_ptr += (8 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, 
dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (4 == height) { + hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (0 == (height % 8)) { + hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val); + } +} + +static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t 
height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1; + v8i16 in0, in1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + LD_SB2(src0_ptr, src_stride, src0, src1); + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src0, src1); + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t weight, offset; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); + + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + LD_SH2(src1_ptr, src2_stride, in4, in5); + XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + 
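+    /* sixth (last) input row */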
VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 in0, in1, in2, in3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (6 == height) { + hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + 
dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (0 == (height % 4)) { + hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val); + } +} + +static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask2 = { + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 + }; + v16i8 mask1, mask3; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + mask3 = mask2 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + 
int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); + LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); + LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); + src1_ptr += (4 * src2_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7, + in4, in5, in6, in7, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + uint8_t *dst_tmp = dst + 16; + v16i8 src0, src1, 
src2, src3; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src0, src2); + LD_SB2(src0_ptr + 16, src_stride, src1, src3); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in2); + LD_SH2(src1_ptr + 8, src2_stride, in1, in3); + LD_SH2(src1_ptr + 16, src2_stride, in4, in5); + src1_ptr += (2 * src2_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + /* 8 width */ + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } +} + +static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1; + v8i16 in0, in1, in2, in3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = 
weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + src0_ptr += src_stride; + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + dst += dst_stride; + } +} + +static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t weight, offset; + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, dst10; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v4i32 dst10_r, dst10_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + src1_ptr += (2 * src2_stride); + + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + + ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l); + dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec); + dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec); + SRAR_W2_SW(dst10_r, dst10_l, rnd_vec); + dst10_r = CLIP_SW_0_255(dst10_r); + dst10_l = CLIP_SW_0_255(dst10_l); + + HEVC_PCK_SW_SB2(dst10_l, dst10_r, dst10_r); + ST4x2_UB(dst10_r, dst, dst_stride); +} + +static void hevc_vt_biwgt_4t_4x4_msa(uint8_t 
*src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t weight, offset; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, src6554; + v8i16 dst10, dst32; + v4i32 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554); + XORI_B2_128_SB(src4332, src6554); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + + HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst10_r, dst32_r, dst10_l, dst32_l); + + HEVC_PCK_SW_SB4(dst10_l, dst10_r, dst32_l, dst32_r, dst10_r); + ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t weight, offset; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src2110, src4332, src6554, src8776; + v8i16 dst10, dst32, dst54, dst76; + v4i32 dst10_r, dst32_r, dst54_r, dst76_r; + v4i32 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) 
src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); + src0_ptr += (6 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, + src4332, src6554, src8776); + XORI_B3_128_SB(src4332, src6554, src8776); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + dst54 = const_vec; + DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); + + LD_SB2(src0_ptr, src_stride, src9, src2); + src0_ptr += (2 * src_stride); + ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + dst76 = const_vec; + DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76); + HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst10_r, dst32_r, dst54_r, dst76_r, + dst10_l, dst32_l, dst54_l, dst76_l); + + HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r, + dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r); + ST4x8_UB(dst10_r, dst54_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (4 == height) { + hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (0 == (height % 8)) { + hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val); + } +} + +static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec 
= LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + + LD_SB2(src0_ptr, src_stride, src1, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, tmp0, tmp1; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 
src21_r, src43_r, src65_r, src87_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3); + tmp4 = const_vec; + DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4); + tmp5 = const_vec; + DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + 
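/*
 * Editor's note (annotation, not part of the committed patch): each pass of
 * the loop below loads four new rows (src3/src4, then src1/src2) and emits
 * four output rows; the last two interleaved row pairs and src2 stay live so
 * the next pass reuses them as 4-tap history.  Because of the initial
 * src0_ptr -= src_stride, output row y is filtered from input rows y-1..y+2.
 * Scalar view of the vertical 4-tap stage that feeds the weighting code
 * (illustrative helper, not FFmpeg API):
 *
 *     static inline int32_t vt_4tap(const uint8_t *src, ptrdiff_t stride,
 *                                   const int8_t *f)
 *     {
 *         return f[0] * src[0]          + f[1] * src[stride] +
 *                f[2] * src[2 * stride] + f[3] * src[3 * stride];
 *     }
 */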
src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + + LD_SB2(src0_ptr, src_stride, src1, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else if (6 == height) { + hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, rnd_val); + } else { + hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter, height, + weight0, weight1, offset0, offset1, + rnd_val); + } +} + +static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; + v16i8 src2110, src4332; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src0_ptr -= (1 * src_stride); + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH4(src1_ptr, 
src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in5, in4, in7, in6, in4, in5); + XORI_B2_128_SB(src3, src4); + + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp4 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4); + + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); + + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + tmp5 = const_vec; + DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2(src1_ptr + 8, src2_stride, in2, in3); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, 
tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3); + + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + LD_SB2(src0_ptr, src_stride, src5, src2); + src0_ptr += (2 * src_stride); + + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2(src1_ptr + 8, src2_stride, in2, in3); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10, src11; + v8i16 in0, in1, in2, in3, in4, in5; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + /* 16width */ + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + /* 8width */ + LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + /* 16width */ + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2(src1_ptr + 8, src2_stride, in2, in3); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + /* 8width */ + 
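/*
 * Editor's note (annotation, not part of the committed patch): 24-pixel-wide
 * blocks are processed as a 16-column strip (using both the right ILVR and
 * left ILVL interleaves of each row pair) plus an 8-column strip read from
 * src0_ptr + 16 and stored to dst + 16, which only needs the right
 * interleaves.  Conceptually, per pair of output rows:
 *
 *     filter_strip(src,      in_buf,      dst,      16);   // columns  0..15
 *     filter_strip(src + 16, in_buf + 16, dst + 16,  8);   // columns 16..23
 *
 * (filter_strip is an illustrative name, not a function in this file.)
 */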
LD_SB2(src0_ptr + 16, src_stride, src9, src10); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr + 16, src2_stride, in4, in5); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + /* 16width */ + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5); + /* 8width */ + tmp2 = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3); + /* 16width */ + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + /* 8width */ + HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + /* 16width */ + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + /* 8width */ + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst4_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + + /* 16width */ + LD_SB2(src0_ptr, src_stride, src5, src2); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2(src1_ptr + 8, src2_stride, in2, in3); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + /* 8width */ + LD_SB2(src0_ptr + 16, src_stride, src11, src8); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr + 16, src2_stride, in4, in5); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + /* 16width */ + tmp0 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5); + /* 8width */ + tmp2 = const_vec; + DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3); + /* 16width */ + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + /* 8width */ + HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + /* 16width */ + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + + /* 8width */ + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst4_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + uint8_t *dst_tmp = dst + 16; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; + v8i16 in0, in1, in2, 
in3, in4, in5, in6, in7; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16i8 src10_l, src32_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src87_l, src109_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l; + + src0_ptr -= src_stride; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + /* 16width */ + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + /* next 16width */ + LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + /* 16width */ + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + LD_SH2(src1_ptr + 8, src2_stride, in2, in3); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + /* 16width */ + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5); + /* 16width */ + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + /* 16width */ + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + src10_r = src32_r; + src21_r = src43_r; + src10_l = src32_l; + src21_l = src43_l; + src2 = src4; + + /* next 16width */ + LD_SB2(src0_ptr + 16, src_stride, src9, src10); + src0_ptr += (2 * src_stride); + LD_SH2(src1_ptr + 16, src2_stride, in4, in5); + LD_SH2(src1_ptr + 24, src2_stride, in6, in7); + src1_ptr += (2 * src2_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); + /* next 16width */ + tmp2 = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2); + tmp6 = const_vec; + DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6); + tmp3 = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3); + tmp7 = const_vec; + DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7); + /* next 16width */ + HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, + in4, in5, in6, in7, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst6_r, dst7_r, + dst4_l, dst5_l, dst6_l, dst7_l); + + /* next 16width */ + HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r, + dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r); 
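/*
 * Editor's note (annotation, not part of the committed patch): the 32-wide
 * path handles two independent 16-column strips per iteration, the second one
 * addressed through dst_tmp = dst + 16; the interleaved row history
 * (src10_r/src21_r, src76_r/src87_r and their *_l counterparts) is rotated at
 * the bottom of the loop so each strip fetches only two new input rows per
 * pass.  Roughly:
 *
 *     for (y = 0; y < height; y += 2) {
 *         process_16cols(src,      dst,      ...);   // columns  0..15
 *         process_16cols(src + 16, dst + 16, ...);   // columns 16..31
 *     }
 *
 * (process_16cols is an illustrative name, not a function in this file.)
 */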
+ ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src76_r = src98_r; + src87_r = src109_r; + src76_l = src98_l; + src87_l = src109_l; + src8 = src10; + } +} + +static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v8i16 in0, in1; + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst1_r, dst0_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB2(src0_ptr, src_stride, src3, src4); + LD_SH2(src1_ptr, src2_stride, in0, in1); + in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); + XORI_B2_128_SB(src3, src4); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); + + ILVRL_H2_SW(dst1_r, in0, dst0_r, dst0_l); + dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); + dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t 
*filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t offset, weight; + v8i16 in0, in1, in2, in3; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 tmp0, tmp1; + v4i32 dst0_l, dst1_l; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + XORI_B4_128_SB(src3, src4, src5, src6); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst10_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_r >>= 6; + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_r >>= 6; + PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); + HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t 
src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + for (loop_cnt = height >> 3; loop_cnt--;) { + LD_SB8(src0_ptr, src_stride, + src3, src4, src5, src6, src7, src8, src9, src10); + src0_ptr += (8 * src_stride); + LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += (8 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst54_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_r >>= 6; + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + dst65_r = 
__msa_ilvr_h(dst6, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_r >>= 6; + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + dst76_r = __msa_ilvr_h(dst7, dst6); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_r >>= 6; + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + dst87_r = __msa_ilvr_h(dst8, dst7); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_r >>= 6; + VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); + dst9 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9); + dst10_r = __msa_ilvr_h(dst9, dst8); + dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1); + dst6_r >>= 6; + VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst9); + dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1); + dst7_r >>= 6; + PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r, + dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, offset1, + rnd_val); + } else if (4 == height) { + hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, offset1, + rnd_val); + } else if (0 == (height % 8)) { + hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, + offset0, offset1, rnd_val); + } +} + +static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t loop_cnt; + int32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + 
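/* [Editorial note, not part of the patch] This setup block is shared by all
 * hevc_hv_biwgt_4t_* variants: the 8-bit vertical coefficients are
 * sign-extended to 16 bits (clti_s_b + ilvr_b) and splatted in pairs, the two
 * bi-prediction weights are packed into one 32-bit lane, and const_vec is
 * 128 << 6, which compensates for the XORI_B*_128 trick (source bytes are
 * biased by -128 and the 4-tap coefficients sum to 64). */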
SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + 
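/* [Editorial note, not part of the patch] In the 6-wide path that ends just
 * above, the filtering itself runs on full 8-pixel vectors; ST6x4_UB then
 * stores only the leftmost 6 bytes of each of the 4 output rows. */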
int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + int32_t weight, offset; + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v8i16 in0, in1; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v8i16 tmp0, tmp1; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src0_ptr, src_stride, src3, src4); + + LD_SH2(src1_ptr, src2_stride, in0, in1); + XORI_B2_128_SB(src3, src4); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r); + + HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + uint32_t offset, weight; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, 
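/* [Editorial note, not part of the patch] hevc_hv_biwgt_4t_8x2_msa above is
 * the straight-line two-row case: ILVRL_H2_SH produces both the low and high
 * halves of the interleaved intermediates so the vertical 4-tap covers the
 * full 8-pixel width, and pckev_h repacks the two 4-lane results before the
 * weighting step. */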
vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v4i32 dst4_r, dst4_l, dst5_r, dst5_l; + v8i16 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 dst21_r, dst43_r, dst21_l, dst43_l; + v8i16 dst54_r, dst54_l, dst65_r, dst65_l; + v8i16 dst76_r, dst76_l, dst87_r, dst87_l; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + LD_SB3(src0_ptr, src_stride, src0, src1, src2); + src0_ptr += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src0_ptr, src_stride, src3, src4); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r); + + LD_SB2(src0_ptr, src_stride, src5, src6); + src0_ptr += (2 * src_stride); + XORI_B2_128_SB(src5, src6); + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); + dst3_r >>= 
6; + dst3_l >>= 6; + tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r); + + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB2(src0_ptr, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); + dst4_r >>= 6; + dst4_l >>= 6; + tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r); + + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); + dst5_r >>= 6; + dst5_l >>= 6; + tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r); + + HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, + weight_vec, rnd_vec, offset_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst2_r); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt; + uint32_t cnt; + int32_t offset, weight; + uint8_t *src0_ptr_tmp; + int16_t *src1_ptr_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 in0, in1, in2, in3; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (cnt = width >> 3; cnt--;) { + src0_ptr_tmp = src0_ptr; + src1_ptr_tmp = src1_ptr; + dst_tmp = dst; + + LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); + src0_ptr_tmp += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + 
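/* [Editorial note, not part of the patch] hevc_hv_biwgt_4t_8multx4mult_msa is
 * the generic workhorse: the outer loop walks 8-pixel-wide columns (width / 8
 * of them) and the inner loop emits 4 output rows per pass.  The same routine
 * backs the 16/24/32-wide wrappers below, and the 12-wide case is one 8-wide
 * column from this routine plus a 4-wide column from the 4w path. */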
VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); + src0_ptr_tmp += (4 * src_stride); + LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); + src1_ptr_tmp += (4 * src2_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3); + HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } + + src0_ptr += 8; + dst += 8; + src1_ptr += 8; + } +} + +static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, offset1, + rnd_val); + } else if (6 == height) { + hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, offset1, + rnd_val); + } else if (0 == (height % 4)) { + hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, filter_x, filter_y, + height, weight0, + weight1, offset0, offset1, rnd_val, 8); + } +} + +static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, 
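/* [Editorial note, not part of the patch] hevc_hv_biwgt_4t_8w_msa above picks
 * a variant purely by height: 2 and 6 get the straight-line versions, and any
 * multiple of 4 falls through to the generic 8-column routine, which should
 * cover the block heights the HEVC chroma code can produce at this width. */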
+ int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height, weight0, + weight1, offset0, offset1, rnd_val, 8); + + hevc_hv_biwgt_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, + dst + 8, dst_stride, filter_x, filter_y, + height, weight0, weight1, offset0, + offset1, rnd_val); +} + +static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height, weight0, + weight1, offset0, offset1, rnd_val, 16); +} + +static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height, weight0, + weight1, offset0, offset1, rnd_val, 24); +} + +static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) +{ + hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, + src1_ptr, src2_stride, + dst, dst_stride, + filter_x, filter_y, height, weight0, + weight1, offset0, offset1, rnd_val, 32); +} + +#define BI_W_MC_COPY(WIDTH) \ +void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int16_t *src_16bit, \ + int height, \ + int denom, \ + int weight0, \ + int weight1, \ + int offset0, \ + int offset1, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + int shift = 14 + 1 - 8; \ + int log2Wd = denom + shift - 1; \ + \ + hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ + dst, dst_stride, height, \ + weight0, weight1, offset0, \ + offset1, log2Wd); \ +} + +BI_W_MC_COPY(4); +BI_W_MC_COPY(6); +BI_W_MC_COPY(8); +BI_W_MC_COPY(12); +BI_W_MC_COPY(16); +BI_W_MC_COPY(24); +BI_W_MC_COPY(32); +BI_W_MC_COPY(48); +BI_W_MC_COPY(64); + +#undef BI_W_MC_COPY + +#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ +void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int16_t *src_16bit, \ + int height, \ + int denom, \ + int weight0, \ + int weight1, \ + int offset0, \ + int offset1, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ + \ + int shift = 14 + 1 - 8; \ + int log2Wd = denom + shift - 1; \ + \ + hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \ + 
src_16bit, MAX_PB_SIZE, \ + dst, dst_stride, \ + filter, height, \ + weight0, weight1, offset0, \ + offset1, log2Wd); \ +} + +BI_W_MC(qpel, h, 4, 8, hz, mx); +BI_W_MC(qpel, h, 8, 8, hz, mx); +BI_W_MC(qpel, h, 12, 8, hz, mx); +BI_W_MC(qpel, h, 16, 8, hz, mx); +BI_W_MC(qpel, h, 24, 8, hz, mx); +BI_W_MC(qpel, h, 32, 8, hz, mx); +BI_W_MC(qpel, h, 48, 8, hz, mx); +BI_W_MC(qpel, h, 64, 8, hz, mx); + +BI_W_MC(qpel, v, 4, 8, vt, my); +BI_W_MC(qpel, v, 8, 8, vt, my); +BI_W_MC(qpel, v, 12, 8, vt, my); +BI_W_MC(qpel, v, 16, 8, vt, my); +BI_W_MC(qpel, v, 24, 8, vt, my); +BI_W_MC(qpel, v, 32, 8, vt, my); +BI_W_MC(qpel, v, 48, 8, vt, my); +BI_W_MC(qpel, v, 64, 8, vt, my); + +BI_W_MC(epel, h, 4, 4, hz, mx); +BI_W_MC(epel, h, 8, 4, hz, mx); +BI_W_MC(epel, h, 6, 4, hz, mx); +BI_W_MC(epel, h, 12, 4, hz, mx); +BI_W_MC(epel, h, 16, 4, hz, mx); +BI_W_MC(epel, h, 24, 4, hz, mx); +BI_W_MC(epel, h, 32, 4, hz, mx); + +BI_W_MC(epel, v, 4, 4, vt, my); +BI_W_MC(epel, v, 8, 4, vt, my); +BI_W_MC(epel, v, 6, 4, vt, my); +BI_W_MC(epel, v, 12, 4, vt, my); +BI_W_MC(epel, v, 16, 4, vt, my); +BI_W_MC(epel, v, 24, 4, vt, my); +BI_W_MC(epel, v, 32, 4, vt, my); + +#undef BI_W_MC + +#define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \ +void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int16_t *src_16bit, \ + int height, \ + int denom, \ + int weight0, \ + int weight1, \ + int offset0, \ + int offset1, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ + const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ + \ + int shift = 14 + 1 - 8; \ + int log2Wd = denom + shift - 1; \ + \ + hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \ + src_16bit, MAX_PB_SIZE, \ + dst, dst_stride, \ + filter_x, filter_y, \ + height, weight0, weight1, \ + offset0, offset1, log2Wd); \ +} + +BI_W_MC_HV(qpel, hv, 4, 8, hv); +BI_W_MC_HV(qpel, hv, 8, 8, hv); +BI_W_MC_HV(qpel, hv, 12, 8, hv); +BI_W_MC_HV(qpel, hv, 16, 8, hv); +BI_W_MC_HV(qpel, hv, 24, 8, hv); +BI_W_MC_HV(qpel, hv, 32, 8, hv); +BI_W_MC_HV(qpel, hv, 48, 8, hv); +BI_W_MC_HV(qpel, hv, 64, 8, hv); + +BI_W_MC_HV(epel, hv, 4, 4, hv); +BI_W_MC_HV(epel, hv, 8, 4, hv); +BI_W_MC_HV(epel, hv, 6, 4, hv); +BI_W_MC_HV(epel, hv, 12, 4, hv); +BI_W_MC_HV(epel, hv, 16, 4, hv); +BI_W_MC_HV(epel, hv, 24, 4, hv); +BI_W_MC_HV(epel, hv, 32, 4, hv); + +#undef BI_W_MC_HV diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c new file mode 100644 index 0000000000..754fbdbb41 --- /dev/null +++ b/libavcodec/mips/hevc_mc_uni_msa.c @@ -0,0 +1,3964 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" +#include "libavcodec/mips/hevc_macros_msa.h" + +static void copy_width8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_width12_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); +} + +static void 
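/* [Editorial note, not part of the patch] The copy_width*_msa helpers are the
 * unfiltered pel-copy path of the uni MC code (integer-pel motion).  For the
 * 8-wide case above this is equivalent to the scalar loop
 *
 *     for (y = 0; y < height; y++)
 *         memcpy(dst + y * dst_stride, src + y * src_stride, 8);
 *
 * the vector code simply unrolls it by 12, 8, 4 or 2 rows depending on which
 * multiple the height happens to be. */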
copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + uint8_t *src_tmp, *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width24_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height); +} + +static void copy_width32_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width48_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + copy_16multx8mult_msa(src, src_stride, 
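/* [Editorial note, not part of the patch] The wider copies are built from
 * copy_16multx8mult_msa, which walks the block in 16-byte columns; 24-wide
 * adds an 8-wide tail via copy_width8_msa, while 48 and 64 are handled as
 * pure multiples of 16. */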
dst, dst_stride, height, 48); +} + +static void copy_width64_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ + filt0, filt1, filt2, filt3) \ +( { \ + v8i16 tmp0, tmp1; \ + \ + tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ + tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \ + tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \ + tmp0 = __msa_adds_s_h(tmp0, tmp1); \ + \ + tmp0; \ +} ) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ +} + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ +} + +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ +( { \ + v8i16 tmp0; \ + \ + tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ + \ + tmp0; \ +} ) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, 
filt0, filt1, \ + out0, out1) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +} + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, filt0, filt1, \ + out0, out1, out2, out3) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + out0, out1, out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ + out0, out1, out2, out3); \ +} + +static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + SRAR_H2_SH(out0, out1, rnd_vec); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 
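/* [Editorial note, not part of the patch] The FILT_8TAP_DPADD_S_H and
 * HORIZ_8TAP_* helpers above implement the luma 8-tap horizontal filter with
 * four signed byte dot products.  A rough scalar equivalent of one output
 * pixel in this uni (unweighted) path, ignoring the +/-128 bias trick that
 * XORI_B*_128 and PCKEV_XORI128_UB take care of:
 *
 *     int32_t sum = 0;
 *     for (int k = 0; k < 8; k++)
 *         sum += filter[k] * src[x + k - 3];
 *     dst[x] = av_clip_uint8((sum + (1 << (rnd_val - 1))) >> rnd_val);
 */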
filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, uint8_t rnd_val) +{ + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (16 == height) { + common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter, + rnd_val); + } +} + +static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, 
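/* [Editorial note, not part of the patch] mc_filt_mask_arr holds the VSHF.B
 * shuffle patterns: the first 16 bytes gather the overlapping sample pairs of
 * an 8-wide row from a single source vector, while the 4-width patterns use
 * indices of 16 and above to pull bytes from the second source operand, so one
 * shuffled vector carries the pairs for two 4-wide rows at once. */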
int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height, rnd_val); + } +} + +static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint8_t *src1_ptr, *dst1; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1; + v8i16 rnd_vec; + + mask00 = LD_UB(&mc_filt_mask_arr[0]); + mask0 = LD_UB(&mc_filt_mask_arr[16]); + rnd_vec = __msa_fill_h(rnd_val); + + src1_ptr = src - 3; + dst1 = dst; + + dst = dst1 + 8; + src = src1_ptr + 8; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask00 + 2; + mask2 = mask00 + 4; + mask3 = mask00 + 6; + mask4 = mask0 + 2; + mask5 = mask0 + 4; + mask6 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + /* 8 width */ + LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src1_ptr += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst1, dst_stride); + dst1 += (4 * dst_stride); + + /* 4 width */ + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5, + mask6, filt0, filt1, filt2, filt3, out0, + out1); + SRAR_H2_SH(out0, out1, rnd_vec); + SAT_SH2_SH(out0, out1, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, 
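/* [Editorial note, not part of the patch] The 12-wide path above splits each
 * row into an 8-wide part and a 4-wide part, each with its own set of shuffle
 * masks; the 16-wide and wider paths below instead load extra bytes per row
 * (e.g. a second vector at src + 8) so the 8-tap window always has the
 * horizontal context it needs. */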
mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; + v16i8 vec11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10; + v8i16 out11, filt; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 16, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9); + VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3); + DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0, + out8, out2, out9); + DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3); + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8); + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9); + VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3); + DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4, + out10, out6, out11); + DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11); + VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7); + DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1, + out0, out8, out2, out9); + DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3); + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10); + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11); + VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7); + DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3, + out4, out10, out6, out11); + DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7); + ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0, + out8, out2, out9); + ADDS_SH2_SH(out1, out5, out3, out7, out1, out3); + SRAR_H4_SH(out0, out8, 
out2, out9, rnd_vec); + SRAR_H2_SH(out1, out3, rnd_vec); + SAT_SH4_SH(out0, out8, out2, out9, 7); + SAT_SH2_SH(out1, out3, 7); + out = PCKEV_XORI128_UB(out8, out9); + ST8x2_UB(out, dst + 16, dst_stride); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2; + v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB3(src, 16, src0, src2, src3); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0, + vec0, vec1, vec2); + DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2); + VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1, + vec0, vec1, vec2); + DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1); + out2 = __msa_dpadd_s_h(out2, vec2, filt1); + VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2, + vec0, vec1, vec2); + DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, 
out4, out5); + VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3, + vec0, vec1, vec2); + DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4); + out5 = __msa_dpadd_s_h(out5, vec2, filt3); + ADDS_SH2_SH(out0, out3, out1, out4, out0, out1); + out2 = __msa_adds_s_h(out2, out5); + SRAR_H2_SH(out0, out1, rnd_vec); + out6 = __msa_srar_h(out2, rnd_vec); + SAT_SH3_SH(out0, out1, out6, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + + src1 = LD_SB(src + 40); + src += src_stride; + src1 = (v16i8) __msa_xori_b((v16u8) src1, 128); + + VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0, + vec0, vec1, vec2); + DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2); + VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1, + vec0, vec1, vec2); + DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1); + out2 = __msa_dpadd_s_h(out2, vec2, filt1); + VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2, + vec0, vec1, vec2); + DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5); + VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3, + vec0, vec1, vec2); + DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4); + out5 = __msa_dpadd_s_h(out5, vec2, filt3); + ADDS_SH2_SH(out0, out3, out1, out4, out3, out4); + out5 = __msa_adds_s_h(out2, out5); + SRAR_H3_SH(out3, out4, out5, rnd_vec); + SAT_SH3_SH(out3, out4, out5, 7); + out = PCKEV_XORI128_UB(out6, out3); + ST_UB(out, dst + 16); + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst + 32); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + mask2, mask3, filt0, filt1, filt2, filt3, + out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + mask2, mask3, filt0, filt1, filt2, filt3, + out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, 
src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; + v8i16 filt, out10, out32; + v8i16 rnd_vec; + + src -= (3 * src_stride); + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRAR_H2_SH(out10, out32, rnd_vec); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + v8i16 rnd_vec; + + src -= (3 * src_stride); + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + 
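        /* these copies slide the interleaved row-pair window down by four rows,
           so the next loop iteration only has to load four new source rows */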
src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + int32_t loop_cnt; + uint32_t out2, out3; + uint64_t out0, out1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1; + v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2; + v8i16 filt, filt0, filt1, filt2, filt3; + v8i16 rnd_vec; + v4i32 mask = { 2, 6, 2, 6 }; + + src -= (3 * src_stride); + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter_y */ + filt = LD_SH(filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* 4 width */ + VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1); + VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3); + VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + src += (2 * src_stride); + + ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6, + vec01, vec23, vec45, vec67); + tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1, + filt2, filt3); + ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23, + vec45, vec67); + tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1, + filt2, filt3); + + /* 4 width */ + VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7); + ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23, + vec45, vec67); + tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1, + filt2, filt3); + SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec); + SAT_SH3_SH(tmp0, tmp1, tmp2, 7); + PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2); + XORI_B3_128_SB(res0, res1, res2); + + out0 = __msa_copy_u_d((v2i64) res0, 0); + out1 = __msa_copy_u_d((v2i64) res1, 0); + out2 = __msa_copy_u_w((v4i32) res2, 0); + out3 = __msa_copy_u_w((v4i32) res2, 1); + SD(out0, dst); + SW(out2, (dst + 8)); + dst += dst_stride; + SD(out1, dst); + SW(out3, (dst + 8)); + dst += dst_stride; + + src0 = src2; + src1 = src3; + src2 = src4; + src3 = src5; + src4 = src6; + src5 = src7; + src6 = src8; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + vec3 = vec5; + vec4 = vec6; + vec5 = vec7; + } +} + +static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v8i16 rnd_vec; + + src -= (3 * src_stride); + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, 
src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val, int32_t width) +{ + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v8i16 rnd_vec; + + src -= (3 * src_stride); + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, + src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, + src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, 
src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, + filt0, filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, + filt0, filt1, filt2, filt3); + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, uint8_t rnd_val) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + rnd_val, 16); + + common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter, + height, rnd_val); +} + +static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, uint8_t rnd_val) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + rnd_val, 32); +} + +static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, uint8_t rnd_val) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + rnd_val, 48); +} + +static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, uint8_t rnd_val) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + rnd_val, 64); +} + +static void hevc_hv_uni_8t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + 
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst30, dst41, dst52, dst63, dst66, dst87; + v4i32 dst0_r, dst1_r; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + src -= ((3 * src_stride) + 3); + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + + dst30 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst30, dst30, dst30, dst30); + dst41 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst41, dst41, dst41, dst41); + dst52 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst52, dst52, dst52, dst52); + dst63 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst63, dst63, dst63, dst63); + + ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52, + dst10_r, dst21_r, dst32_r); + dst43_r = __msa_ilvl_h(dst41, dst30); + dst54_r = __msa_ilvl_h(dst52, dst41); + dst65_r = __msa_ilvl_h(dst63, dst52); + dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src7, src8); + src += 2 * src_stride; + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst87 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst87, dst87, dst87, dst87); + + dst76_r = __msa_ilvr_h(dst87, dst66); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst87_r = __msa_vshf_h(mask4, dst87, dst87); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst0_r >>= 6; + dst1_r >>= 6; + SRARI_W2_SW(dst0_r, dst1_r, 6); + dst0_r = CLIP_SW_0_255(dst0_r); + dst1_r = CLIP_SW_0_255(dst1_r); + + HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + dst += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); + } +} + +static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 
filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 dst21_l, dst43_l, dst65_l, dst87_l; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= ((3 * src_stride) + 3); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + dst1 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + dst2 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + dst3 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + dst5 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + dst6 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + + ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_r, dst32_r, dst54_r, dst21_r); + ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); + ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_l, dst32_l, dst54_l, dst21_l); + ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src_tmp, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + src_tmp += 2 * src_stride; + + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst0_l >>= 6; + + 
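For reference: the horizontal pass above produces 16-bit intermediates (with filters whose taps sum to 64, the 128 << 6 const_vec cancels the xori-128 conversion of the source bytes), and the vertical pass below combines eight such intermediates per output pixel. A rough scalar sketch of what one block of this uni-prediction path computes, assuming 8-bit pixels and the standard HEVC 8-tap filters; the helper below is illustrative only, not part of the patch:

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* scalar model of the 8-tap horizontal + 8-tap vertical uni filter:
 * horizontal sums are kept unshifted (8-bit input), the vertical sum is
 * then scaled back by the two 6-bit filter gains with rounding */
static void hevc_hv_uni_8t_scalar(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride,
                                  const int8_t *fx, const int8_t *fy,
                                  int w, int h)
{
    int16_t mid[64 + 7][64];   /* enough for one 64x64 block */

    src -= 3 * src_stride + 3;
    for (int y = 0; y < h + 7; y++)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 8; k++)
                s += fx[k] * src[y * src_stride + x + k];
            mid[y][x] = s;
        }
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 8; k++)
                s += fy[k] * mid[y + k][x];
            dst[y * dst_stride + x] = clip_u8(((s >> 6) + 32) >> 6);
        }
}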
VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst8 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst8, dst8, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_r >>= 6; + dst1_l >>= 6; + SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + dst1_r = CLIP_SW_0_255(dst1_r); + dst1_l = CLIP_SW_0_255(dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst10_l = dst32_l; + dst32_l = dst54_l; + dst54_l = dst76_l; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst21_l = dst43_l; + dst43_l = dst65_l; + dst65_l = dst87_l; + dst6 = dst8; + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_uni_8t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); +} + +static void hevc_hv_uni_8t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + + hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height); +} + +static void hevc_hv_uni_8t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 16); +} + +static void hevc_hv_uni_8t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 24); +} + +static void hevc_hv_uni_8t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 32); +} + +static void hevc_hv_uni_8t_48w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 48); +} + +static void hevc_hv_uni_8t_64w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 64); +} + +static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1; + v16u8 out; + v8i16 filt, res0; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + rnd_vec = 
__msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB2(src, src_stride, src0, src1); + XORI_B2_128_SB(src0, src1); + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); + res0 = __msa_srar_h(res0, rnd_vec); + res0 = __msa_sat_s_h(res0, 7); + out = PCKEV_XORI128_UB(res0, res0); + ST4x2_UB(out, dst, dst_stride); +} + +static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1; + v16u8 out; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + SRAR_H2_SH(out0, out1, rnd_vec); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, + filt0, filt1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += 
(4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, + filt0, filt1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + if (2 == height) { + common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (4 == height) { + common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (8 == height) { + common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (16 == height) { + common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, + rnd_val); + } +} + +static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 out4, out5; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + out4 = PCKEV_XORI128_UB(out0, out1); + out5 = PCKEV_XORI128_UB(out2, out3); + ST6x4_UB(out4, out5, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, vec0, vec1, vec2, vec3; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + XORI_B2_128_SB(src0, src1); + VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1); + DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1); + VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3); + DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1); + SRAR_H2_SH(vec0, vec1, rnd_vec); + SAT_SH2_SH(vec0, vec1, 7); + out = PCKEV_XORI128_UB(vec0, vec1); + ST8x2_UB(out, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, + uint8_t 
*dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + if ((2 == height) || (6 == height)) { + common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter, + height, rnd_val); + } else { + common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter, + height, rnd_val); + } +} + +static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 vec10, vec11; + v16u8 tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3, out4, out5; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + mask2 = LD_SB(&mc_filt_mask_arr[32]); + + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask3 = mask2 + 2; + + rnd_vec = __msa_fill_h(rnd_val); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5); + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7); + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1); + DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out2, out3, out4, out5); + DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1); + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9); + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11); + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3); + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1, + out2, out3, out4, out5); + DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRAR_H2_SH(out4, out5, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH2_SH(out4, out5, 7); + tmp0 = PCKEV_XORI128_UB(out2, out3); + tmp1 = PCKEV_XORI128_UB(out4, out5); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + tmp0 = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, 
src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 out; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRAR_H4_SH(out4, out5, out6, out7, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint8_t *dst1 = dst + 16; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 filt0, filt1, mask0, mask1, mask00, mask11; + v8i16 filt, out0, out1, out2, out3; + v16u8 tmp0, tmp1; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask00 = mask0 + 8; + mask11 = mask0 + 10; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1); + VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3); + VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5); + VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, + out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + ST_UB(tmp0, dst); + dst += dst_stride; + tmp0 = PCKEV_XORI128_UB(out2, out3); + ST_UB(tmp0, dst); + dst += dst_stride; + + VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1); + VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3); + VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5); + VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, + out0, out1, out2, out3); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, 
out1); + ST_UB(tmp0, dst); + dst += dst_stride; + tmp0 = PCKEV_XORI128_UB(out2, out3); + ST_UB(tmp0, dst); + dst += dst_stride; + + /* 8 width */ + VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1); + VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3); + VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5); + VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7); + + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, + out0, out1, out2, out3); + + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst1, dst_stride); + dst1 += (4 * dst_stride); + } +} + +static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 rnd_vec; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8); + src += src_stride; + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, + filt0, filt1, out4, out5, out6, out7); + SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRAR_H4_SH(out4, out5, out6, out7, rnd_vec); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r; + v16i8 src2110, src4332, filt0, filt1; + v16u8 out; + v8i16 filt, out10; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + LD_SB2(src, src_stride, src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); + out10 = __msa_srar_h(out10, rnd_vec); + out10 = __msa_sat_s_h(out10, 7); + out = PCKEV_XORI128_UB(out10, out10); + 
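    /* PCKEV_XORI128_UB packs the even (low) byte of each halfword result and
       xors it with 128, undoing the initial xori-128 conversion of the source */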
ST4x2_UB(out, dst, dst_stride); +} + +static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, filt0, filt1; + v8i16 filt, out10, out32; + v16u8 out; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB3(src, src_stride, src3, src4, src5); + src += (3 * src_stride); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); + + src2 = LD_SB(src); + src += (src_stride); + ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); + SRAR_H2_SH(out10, out32, rnd_vec); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + if (2 == height) { + common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else { + common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter, + height, rnd_val); + } +} + +static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1; + v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3; + v8i16 filt, filt0, filt1; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter_y */ + filt = LD_SH(filter); + SPLATI_H2_SH(filt, 0, 1, filt0, filt1); + + LD_UB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128); + vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128); + vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src3, src0, src1, src2); + src += (4 * src_stride); + + vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128); + ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23); + tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1); + + vec0 = __msa_xori_b((v16u8) src0, 128); + ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30); + tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1); + + vec1 = __msa_xori_b((v16u8) src1, 128); + vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0); + tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1); + + vec2 = __msa_xori_b((v16u8) src2, 128); + vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1); + tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1); + + 
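The SRAR / SAT / PCKEV_XORI128 sequence below is the vector form of the usual rounded shift and clip back to unsigned pixels. Because the source bytes were xored with 128 and the filter taps sum to 64, per output pixel it reduces to roughly the following (illustrative scalar helper, assuming rnd_val == 6):

#include <stdint.h>

/* scalar equivalent of SRAR_H + SAT_SH(.., 7) + PCKEV_XORI128_UB for one
 * output pixel; sum is the filter sum over plain unsigned source pixels */
static uint8_t round_and_clip(int32_t sum, int rnd_val)
{
    int32_t v = (sum + (1 << (rnd_val - 1))) >> rnd_val;
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t) v);
}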
SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST6x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1; + v16u8 out; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter_y */ + filt = LD_SH(filter); + SPLATI_H2_SH(filt, 0, 1, filt0, filt1); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B2_SH(src1, src0, src3, src2, src01, src23); + tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1); + ILVR_B2_SH(src2, src1, src4, src3, src12, src34); + tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1); + SRAR_H2_SH(tmp0, tmp1, rnd_vec); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST8x2_UB(out, dst, dst_stride); +} + +static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, uint8_t rnd_val) +{ + uint32_t loop_cnt; + uint64_t out0, out1, out2; + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2; + v8i16 filt, filt0, filt1; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + /* rearranging filter_y */ + filt = LD_SH(filter); + SPLATI_H2_SH(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2); + + for (loop_cnt = 2; loop_cnt--;) { + LD_SB3(src, src_stride, src3, src4, src5); + src += (3 * src_stride); + + XORI_B3_128_SB(src3, src4, src5); + ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); + tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1); + tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1); + SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec); + SAT_SH3_SH(tmp0, tmp1, tmp2, 7); + PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2); + XORI_B2_128_SH(tmp0, tmp2); + + out0 = __msa_copy_u_d((v2i64) tmp0, 0); + out1 = __msa_copy_u_d((v2i64) tmp0, 1); + out2 = __msa_copy_u_d((v2i64) tmp2, 0); + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + SD(out2, dst); + dst += dst_stride; + + src2 = src5; + vec0 = vec3; + vec2 = vec4; + } +} + +static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src7, src8, src9, src10; + v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src2, src8, src7, 
src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + if (2 == height) { + common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else if (6 == height) { + common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + } else { + common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride, + filter, height, rnd_val); + } +} + +static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 out0, out1; + v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1; + v4u32 mask = { 2, 6, 2, 6 }; + v8i16 rnd_vec; + + /* rearranging filter_y */ + filt = LD_SH(filter); + SPLATI_H2_SH(filt, 0, 1, filt0, filt1); + + rnd_vec = __msa_fill_h(rnd_val); + + src -= src_stride; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + ILVR_B2_SH(src1, src0, src3, src2, src10, src32); + VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3); + VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5); + tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1); + ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5, + src21, src43, src54, src65); + tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1); + tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1); + tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1); + ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211); + tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1); + tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1); + SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec); + SRAR_H2_SH(tmp4, tmp5, rnd_vec); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH2_SH(tmp4, tmp5, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(out0, out1, dst, dst_stride); + out0 = PCKEV_XORI128_UB(tmp4, tmp5); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src1 = src5; + src2 = src6; + vec0 = vec4; + vec1 = vec5; + src2 = src6; + } +} + +static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 
src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v8i16 rnd_vec; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); + out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); + out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + uint32_t loop_cnt; + uint64_t out0, out1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src11, filt0, filt1; + v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + v16i8 src109_r, src10_l, src32_l, src21_l, src43_l; + v16u8 out; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l; + v8i16 rnd_vec; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + rnd_vec = __msa_fill_h(rnd_val); + + /* 16 width */ + LD_SB3(src, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + /* 8 width */ + LD_SB3(src + 16, src_stride, src6, src7, src8); + src += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + /* 16 width */ + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + /* 8 width */ + LD_SB2(src + 16, src_stride, src9, src10); + src += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + + /* 16 width */ + out0_r = 
FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); + + /* 8 width */ + out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); + + /* 16 + 8 width */ + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRAR_H2_SH(out0_l, out1_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH2_SH(out0_l, out1_l, 7); + out = PCKEV_XORI128_UB(out0_r, out0_l); + ST_UB(out, dst); + PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r); + XORI_B2_128_SH(out2_r, out3_r); + out0 = __msa_copy_u_d((v2i64) out2_r, 0); + out1 = __msa_copy_u_d((v2i64) out3_r, 0); + SD(out0, dst + 16); + dst += dst_stride; + out = PCKEV_XORI128_UB(out1_r, out1_l); + ST_UB(out, dst); + SD(out1, dst + 16); + dst += dst_stride; + + /* 16 width */ + LD_SB2(src, src_stride, src5, src2); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + /* 8 width */ + LD_SB2(src + 16, src_stride, src11, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + + /* 16 width */ + out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1); + + /* 8 width */ + out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1); + + /* 16 + 8 width */ + SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRAR_H2_SH(out0_l, out1_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH2_SH(out0_l, out1_l, 7); + out = PCKEV_XORI128_UB(out0_r, out0_l); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2_r, out2_r); + ST8x1_UB(out, dst + 16); + dst += dst_stride; + out = PCKEV_XORI128_UB(out1_r, out1_l); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out3_r, out3_r); + ST8x1_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val, int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *dst_tmp, *src_tmp; + v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16i8 src10_l, src32_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src87_l, src109_l; + v8i16 filt; + v16i8 filt0, filt1; + v8i16 rnd_vec; + v16u8 out; + + src -= src_stride; + rnd_vec = __msa_fill_h(rnd_val); + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + for (cnt = (width >> 5); cnt--;) { + dst_tmp = dst; + src_tmp = src; + + /* 16 width */ + LD_SB3(src_tmp, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + /* next 16 width */ + LD_SB3(src_tmp + 16, src_stride, src6, src7, src8); + src_tmp += (3 * src_stride); + + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + 
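        /* ILVR_B2_SB interleaves the low halves of adjacent rows byte-wise and
           ILVL_B2_SB the high halves, pairing vertically adjacent pixels so each
           halfword dot product applies two filter taps at once */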
ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + /* 16 width */ + LD_SB2(src_tmp, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + /* 16 width */ + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); + + /* 16 width */ + SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec); + SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7); + out = PCKEV_XORI128_UB(out0_r, out0_l); + ST_UB(out, dst_tmp); + out = PCKEV_XORI128_UB(out1_r, out1_l); + ST_UB(out, dst_tmp + dst_stride); + + src10_r = src32_r; + src21_r = src43_r; + src10_l = src32_l; + src21_l = src43_l; + src2 = src4; + + /* next 16 width */ + LD_SB2(src_tmp + 16, src_stride, src9, src10); + src_tmp += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); + + /* next 16 width */ + out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1); + out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); + out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1); + + /* next 16 width */ + SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec); + SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7); + out = PCKEV_XORI128_UB(out2_r, out2_l); + ST_UB(out, dst_tmp + 16); + out = PCKEV_XORI128_UB(out3_r, out3_l); + ST_UB(out, dst_tmp + 16 + dst_stride); + + dst_tmp += 2 * dst_stride; + + src76_r = src98_r; + src87_r = src109_r; + src76_l = src98_l; + src87_l = src109_l; + src8 = src10; + } + + src += 32; + dst += 32; + } +} + +static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + uint8_t rnd_val) +{ + common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, rnd_val, 32); +} + +static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst1_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, 
filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); + dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6); + dst0_r = (v4i32) CLIP_SH_0_255(dst0_r); + dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); + + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_uni_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 out0_r, out1_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + LD_SB4(src, src_stride, src3, src4, src5, src6); + XORI_B4_128_SB(src3, src4, src5, src6); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + dst10_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_r >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, 
src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + dst21_r = __msa_ilvr_h(dst2, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_r >>= 6; + + PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r); + SRARI_H2_SH(out0_r, out1_r, 6); + CLIP_SH2_0_255(out0_r, out1_r); + out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r); + + ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 out0_r, out1_r, out2_r, out3_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + for (loop_cnt = height >> 3; loop_cnt--;) { + LD_SB8(src, src_stride, + src3, src4, src5, src6, src7, src8, src9, src10); + src += (8 * src_stride); + + XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + dst54_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_r >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + dst65_r = __msa_ilvr_h(dst6, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_r >>= 6; + + 
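+            /* Scalar sketch of the uni hv path, per output pixel:
+             *
+             *   h[y] = sum_{i=0..3} filter_x[i] * src[y][x + i - 1]
+             *   v    = (sum_{j=0..3} filter_y[j] * h[y + j - 1]) >> 6
+             *   out  = clip_uint8((v + 32) >> 6)
+             *
+             * The horizontal stage runs on XORI(128)-flipped bytes; because
+             * the 4-tap coefficients sum to 64, adding const_vec (128 << 6)
+             * restores the unsigned range before the vertical stage.
+             */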
/* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + dst76_r = __msa_ilvr_h(dst7, dst6); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_r >>= 6; + + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + dst87_r = __msa_ilvr_h(dst8, dst7); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_r >>= 6; + + /* row 9 */ + VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); + dst9 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9); + + dst10_r = __msa_ilvr_h(dst9, dst8); + dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1); + dst6_r >>= 6; + + /* row 10 */ + VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + dst21_r = __msa_ilvr_h(dst2, dst9); + dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1); + dst7_r >>= 6; + + PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r, + dst5_r, dst4_r, dst7_r, dst6_r, + out0_r, out1_r, out2_r, out3_r); + + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r); + + PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r); + ST4x8_UB(out0_r, out1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hv_uni_4t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + if (2 == height) { + hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (4 == height) { + hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (0 == (height % 8)) { + hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } +} + +static void hevc_hv_uni_4t_6w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v8i16 out0_r, out1_r, out2_r, out3_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + 
dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + out0_r, out1_r, out2_r, out3_r); + + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r); + + PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r); + ST6x4_UB(out0_r, out1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hv_uni_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v8i16 out0_r, out1_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + 
DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r); + SRARI_H2_SH(out0_r, out1_r, 6); + CLIP_SH2_0_255(out0_r, out1_r); + out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r); + + ST8x2_UB(out0_r, dst, dst_stride); +} + +static void hevc_hv_uni_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v4i32 dst4_r, dst4_l, dst5_r, dst5_l; + v8i16 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 dst21_r, dst43_r, dst21_l, dst43_l; + v8i16 dst54_r, dst54_l, dst65_r, dst65_l; + v8i16 dst76_r, dst76_l, dst87_r, dst87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + + dst0_r >>= 
6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + + XORI_B2_128_SB(src7, src8); + + /* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); + + dst4_r >>= 6; + dst4_l >>= 6; + + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); + dst5_r >>= 6; + dst5_l >>= 6; + + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r); + PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H2_SH(out4_r, out5_r, 6); + CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r); + CLIP_SH2_0_255(out4_r, out5_r); + + PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r); + out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r); + + ST8x4_UB(out0_r, out1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(out2_r, dst, dst_stride); +} + +static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v8i16 out0_r, out1_r, out2_r, out3_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = 
__msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB3(src_tmp, src_stride, src0, src1, src2); + src_tmp += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); + src_tmp += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + out0_r, out1_r, out2_r, out3_r); + + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r); + + PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r); + ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_uni_4t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + if (2 == height) { + hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (6 == height) { + hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (0 == (height % 4)) { + hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + } +} + +static void hevc_hv_uni_4t_12w_msa(uint8_t *src, 
+ int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + + hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height); +} + +static void hevc_hv_uni_4t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 16); +} + +static void hevc_hv_uni_4t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 24); +} + +static void hevc_hv_uni_4t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 32); +} + +#define UNI_MC_COPY(WIDTH) \ +void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \ +} + +UNI_MC_COPY(8); +UNI_MC_COPY(12); +UNI_MC_COPY(16); +UNI_MC_COPY(24); +UNI_MC_COPY(32); +UNI_MC_COPY(48); +UNI_MC_COPY(64); + +#undef UNI_MC_COPY + +#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ +void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ + \ + common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ + filter, height, 6); \ +} + +UNI_MC(qpel, h, 4, 8, hz, mx); +UNI_MC(qpel, h, 8, 8, hz, mx); +UNI_MC(qpel, h, 12, 8, hz, mx); +UNI_MC(qpel, h, 16, 8, hz, mx); +UNI_MC(qpel, h, 24, 8, hz, mx); +UNI_MC(qpel, h, 32, 8, hz, mx); +UNI_MC(qpel, h, 48, 8, hz, mx); +UNI_MC(qpel, h, 64, 8, hz, mx); + +UNI_MC(qpel, v, 4, 8, vt, my); +UNI_MC(qpel, v, 8, 8, vt, my); +UNI_MC(qpel, v, 12, 8, vt, my); +UNI_MC(qpel, v, 16, 8, vt, my); +UNI_MC(qpel, v, 24, 8, vt, my); +UNI_MC(qpel, v, 32, 8, vt, my); +UNI_MC(qpel, v, 48, 8, vt, my); +UNI_MC(qpel, v, 64, 8, vt, my); + +UNI_MC(epel, h, 4, 4, hz, mx); +UNI_MC(epel, h, 6, 4, hz, mx); +UNI_MC(epel, h, 8, 4, hz, mx); +UNI_MC(epel, h, 12, 4, hz, mx); +UNI_MC(epel, h, 16, 4, hz, mx); +UNI_MC(epel, h, 24, 4, hz, mx); +UNI_MC(epel, h, 32, 4, hz, mx); + +UNI_MC(epel, v, 4, 4, vt, my); +UNI_MC(epel, v, 6, 4, vt, my); +UNI_MC(epel, v, 8, 4, vt, my); +UNI_MC(epel, v, 12, 4, vt, my); +UNI_MC(epel, v, 16, 4, vt, my); +UNI_MC(epel, v, 24, 4, vt, my); +UNI_MC(epel, v, 32, 4, vt, my); + +#undef UNI_MC + +#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \ +void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ + const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ + \ + 
hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ + dst_stride, filter_x, \ + filter_y, height); \ +} + +UNI_MC_HV(qpel, hv, 4, 8, hv); +UNI_MC_HV(qpel, hv, 8, 8, hv); +UNI_MC_HV(qpel, hv, 12, 8, hv); +UNI_MC_HV(qpel, hv, 16, 8, hv); +UNI_MC_HV(qpel, hv, 24, 8, hv); +UNI_MC_HV(qpel, hv, 32, 8, hv); +UNI_MC_HV(qpel, hv, 48, 8, hv); +UNI_MC_HV(qpel, hv, 64, 8, hv); + +UNI_MC_HV(epel, hv, 4, 4, hv); +UNI_MC_HV(epel, hv, 6, 4, hv); +UNI_MC_HV(epel, hv, 8, 4, hv); +UNI_MC_HV(epel, hv, 12, 4, hv); +UNI_MC_HV(epel, hv, 16, 4, hv); +UNI_MC_HV(epel, hv, 24, 4, hv); +UNI_MC_HV(epel, hv, 32, 4, hv); + +#undef UNI_MC_HV diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c new file mode 100644 index 0000000000..ce10f413ed --- /dev/null +++ b/libavcodec/mips/hevc_mc_uniw_msa.c @@ -0,0 +1,4790 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" +#include "libavcodec/mips/hevc_macros_msa.h" + +#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \ + out0, out1, out2, out3) \ +{ \ + MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3); \ + SRAR_W4_SW(out0, out1, out2, out3, rnd); \ + ADD4(out0, offset, out1, offset, out2, offset, out3, offset, \ + out0, out1, out2, out3); \ + out0 = CLIP_SW_0_255(out0); \ + out1 = CLIP_SW_0_255(out1); \ + out2 = CLIP_SW_0_255(out2); \ + out3 = CLIP_SW_0_255(out3); \ +} + +#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \ + out0_r, out1_r, out0_l, out1_l) \ +{ \ + ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r); \ + ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l); \ + DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt, \ + out0_r, out1_r, out0_l, out1_l); \ + SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ + ADD4(out0_r, offset, out1_r, offset, \ + out0_l, offset, out1_l, offset, \ + out0_r, out1_r, out0_l, out1_l); \ + out0_r = CLIP_SW_0_255(out0_r); \ + out1_r = CLIP_SW_0_255(out1_r); \ + out0_l = CLIP_SW_0_255(out0_l); \ + out1_l = CLIP_SW_0_255(out1_l); \ +} + +#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \ + out0_r, out1_r, out2_r, out3_r, \ + out0_l, out1_l, out2_l, out3_l) \ +{ \ + HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \ + out0_r, out1_r, out0_l, out1_l); \ + HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd, \ + out2_r, out3_r, out2_l, out3_l); \ +} + +static void hevc_uniwgt_copy_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 zero = { 0 }; + v4i32 weight_vec, offset_vec, rnd_vec; + + weight = weight & 0x0000FFFF; + weight_vec = 
__msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + if (2 == height) { + v16i8 src0, src1; + v8i16 dst0; + v4i32 dst0_r, dst0_l; + + LD_SB2(src, src_stride, src0, src1); + src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); + dst0 = (v8i16) __msa_ilvr_b(zero, src0); + dst0 <<= 6; + + ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); + DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + } else if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 dst0, dst1; + v4i32 dst0_r, dst1_r; + v4i32 dst0_l, dst1_l; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + ILVR_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + dst0 <<= 6; + dst1 <<= 6; + + HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + } else if (0 == height % 8) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, + src0, src1, src2, src3); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } + } +} + +static void hevc_uniwgt_copy_6w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + dst4, dst5, dst6, dst7); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + SLLI_4V(dst4, dst5, dst6, dst7, 6); + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + + HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7, + 
weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_uniwgt_copy_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 zero = { 0 }; + v4i32 weight_vec, offset_vec, rnd_vec; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + if (2 == height) { + v16i8 src0, src1; + v8i16 dst0, dst1; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + + LD_SB2(src, src_stride, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); + + dst0 <<= 6; + dst1 <<= 6; + HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); + } else if (6 == height) { + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + dst4 <<= 6; + dst5 <<= 6; + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); + } else if (0 == height % 4) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void hevc_uniwgt_copy_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v16i8 zero = { 0 }; + v4i32 weight_vec, offset_vec, rnd_vec; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + 
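+        /* Weighted copy, per sample (scalar sketch):
+         *   out = clip_uint8((((src << 6) * weight) >> rnd_val) + offset)
+         * with SRAR giving a rounded shift.  The 12-wide block is handled as
+         * the left 8 columns (ILVR_B widening) plus the right 4 columns,
+         * which ILVL_W2 gathers two rows at a time.
+         */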
LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + dst0, dst1, dst2, dst3); + + SLLI_4V(dst0, dst1, dst2, dst3, 6); + ILVL_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); + dst4 <<= 6; + dst5 <<= 6; + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_uniwgt_copy_16multx4mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3; + v8i16 tmp0, tmp1, tmp2, tmp3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v16i8 zero = { 0 }; + v4i32 weight_vec, offset_vec, rnd_vec; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + for (cnt = width >> 4; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src0, src1, src2, src3); + src_tmp += (4 * src_stride); + ILVR_B2_SH(zero, src0, zero, src1, tmp0, tmp1); + ILVL_B2_SH(zero, src0, zero, src1, tmp2, tmp3); + + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + ILVR_B2_SH(zero, src2, zero, src3, tmp0, tmp1); + ILVL_B2_SH(zero, src2, zero, src3, tmp2, tmp3); + + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void hevc_uniwgt_copy_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride, + height, weight, offset, rnd_val, 16); +} + +static void hevc_uniwgt_copy_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride, + height, weight, offset, rnd_val, 16); + + hevc_uniwgt_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride, + height, weight, offset, rnd_val); +} + +static void hevc_uniwgt_copy_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + 
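+    /* The 32-, 48- and 64-wide weighted copies all reuse the generic
+     * 16-column-multiple kernel; only the width argument changes. */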
hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride, + height, weight, offset, rnd_val, 32); +} + +static void hevc_uniwgt_copy_48w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride, + height, weight, offset, rnd_val, 48); +} + +static void hevc_uniwgt_copy_64w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride, + height, weight, offset, rnd_val, 64); +} + +static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 3; + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, 
dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hz_uniwgt_8t_8w_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + hevc_hz_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter, height, weight, offset, rnd_val); +} + +static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 
filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, 
dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst3_l, dst3_r, dst4_l, dst4_r, dst0_r, dst1_r); + HEVC_PCK_SW_SB4(dst2_l, dst2_r, dst5_l, dst5_r, dst2_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst2_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + dst += dst_stride; + } +} + +static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = 
weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB3(src, 16, src0, src1, src2); + src3 = LD_SB(src + 40); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + ST_SW(dst2_r, dst + 32); + dst += dst_stride; + } +} + +static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + for (loop_cnt = height; loop_cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (cnt = 2; cnt--;) { + LD_SB2(src_tmp, 16, src0, src1); + src2 = LD_SB(src_tmp + 24); + src_tmp += 32; + 
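/* The XORI_B*_128 step below flips the sign bit of every unsigned input
 * byte, which amounts to subtracting 128 once the bytes are consumed by
 * the signed DPADD_SB* dot products.  Assuming the filter taps sum to 64
 * (true for the HEVC luma and chroma interpolation filters), this removes
 * 128 * 64 = 8192 from each filtered sample, and seeding the accumulators
 * with const_vec = 128 << 6 puts it back:
 *
 *     sum_k (src[j + k] - 128) * filt[k] + (128 << 6)
 *         == sum_k src[j + k] * filt[k]      (when sum_k filt[k] == 64)
 */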
XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, 16); + dst_tmp += 32; + } + + src += src_stride; + dst += dst_stride; + } +} + +static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src9, src10, src11, src12, src13, src14; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src1110_r, src1211_r, src1312_r, src1413_r; + v16i8 src2110, src4332, src6554, src8776, src10998; + v16i8 src12111110, src14131312; + v8i16 dst10, dst32, dst54, dst76; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + ILVR_D3_SB(src21_r, src10_r, src43_r, + src32_r, src65_r, src54_r, src2110, src4332, src6554); + + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, + src7, src8, src9, src10, src11, src12, src13, src14); + src += (8 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, + src1110_r, src1211_r, src1312_r, src1413_r); + ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, + src1413_r, src1312_r, + src8776, src10998, src12111110, src14131312); + XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); + + dst10 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1, + filt2, filt3, dst10, dst10, dst10, dst10); + dst32 = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, dst32, dst32, dst32, 
dst32); + dst54 = const_vec; + DPADD_SB4_SH(src6554, src8776, src10998, src12111110, + filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); + dst76 = const_vec; + DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, + filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); + + HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + + src2110 = src10998; + src4332 = src12111110; + src6554 = src14131312; + src6 = src14; + } +} + +static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + 
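/* Naming used by the vertical filters: srcNM_r / srcNM_l are the right /
 * left byte interleaves (ILVR_B / ILVL_B) of input rows N and M, so each
 * halfword lane holds the (row M, row N) pixel pair that one DPADD_SB*
 * term multiplies with one pair of taps; registers such as src2110 pack
 * the low halves of src21_r and src10_r so that the narrow cases can
 * filter two output rows per dot product. */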
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 src2110, src4332, src6554, src8776, src10998; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, + src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_l, src87_l, src98_l, src109_l); + ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3); + tmp4 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4); + tmp5 = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void hevc_vt_uniwgt_8t_16multx2mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val, + 
int32_t width) +{ + uint8_t *src_tmp; + uint8_t *dst_tmp; + int32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v16i8 src10_l, src32_l, src54_l, src76_l; + v16i8 src21_l, src43_l, src65_l, src87_l; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight = weight & 0x0000FFFF; + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src_tmp, src_stride, src7, src8); + src_tmp += (2 * src_stride); + XORI_B2_128_SB(src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + tmp0 = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src10_r = src32_r; + src32_r = src54_r; + src54_r = src76_r; + src21_r = src43_r; + src43_r = src65_r; + src65_r = src87_r; + src10_l = src32_l; + src32_l = src54_l; + src54_l = src76_l; + src21_l = src43_l; + src43_l = src65_l; + src65_l = src87_l; + src6 = src8; + } + + src += 16; + dst += 16; + } +} + +static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val, 16); +} + +static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val, 16); + + hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 
16, dst_stride, + filter, height, weight, offset, rnd_val); +} + +static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val, 32); +} + +static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val, 48); +} + +static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val, 64); +} + +static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst30, dst41, dst52, dst63, dst66, dst87; + v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + src -= ((3 * src_stride) + 3); + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst30 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst30, dst30, dst30, dst30); + dst41 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst41, dst41, dst41, dst41); + dst52 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst52, dst52, dst52, dst52); + dst63 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst63, dst63, dst63, dst63); + + 
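/* Two-stage 2-D filtering for the 4-wide case: the horizontal 8-tap above
 * runs on two source rows at a time (entries >= 16 in mask0 select bytes
 * of the second VSHF operand), so dst30 holds the four filtered samples
 * of row 0 in its low half and of row 3 in its high half, dst41 rows 1/4,
 * dst52 rows 2/5 and dst63 rows 3/6.  The ILVR/ILVL_H steps that follow
 * regroup these by column so that HEVC_FILT_8TAP can apply the vertical
 * 8-tap as halfword dot products; that result is scaled down by 6 bits
 * before the weight/offset stage. */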
ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52, + dst10_r, dst21_r, dst32_r); + + dst43_r = __msa_ilvl_h(dst41, dst30); + dst54_r = __msa_ilvl_h(dst52, dst41); + dst65_r = __msa_ilvl_h(dst63, dst52); + + dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst87 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst87, dst87, dst87, dst87); + dst76_r = __msa_ilvr_h(dst87, dst66); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + + dst0_r >>= 6; + dst1_r >>= 6; + MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); + SRAR_W2_SW(dst0_r, dst1_r, rnd_vec); + ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); + dst0_r = CLIP_SW_0_255(dst0_r); + dst1_r = CLIP_SW_0_255(dst1_r); + + HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + dst += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); + } +} + +static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 dst21_l, dst43_l, dst65_l, dst87_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= ((3 * src_stride) + 3); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, 
vec11); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + dst1 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + dst2 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + dst3 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + dst5 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + dst6 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + + ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_r, dst32_r, dst54_r, dst21_r); + ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); + ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_l, dst32_l, dst54_l, dst21_l); + ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src_tmp, src_stride, src7, src8); + src_tmp += 2 * src_stride; + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 8 */ + VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst8 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst8, dst8, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst1_r >>= 6; + dst1_l >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst10_l = dst32_l; + dst32_l = dst54_l; + dst54_l = dst76_l; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst21_l = dst43_l; + dst43_l = dst65_l; + dst65_l = dst87_l; + dst6 = dst8; + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 8); +} + +static void 
hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 8); + hevc_hv_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height, weight, offset, + rnd_val); +} + +static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 16); +} + +static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 24); +} + +static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 32); +} + +static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 48); +} + +static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 64); +} + +static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, vec0, vec1; + v16i8 mask1; + v8i16 dst0; + v4i32 dst0_r, dst0_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + weight = weight & 0x0000FFFF; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB2(src, src_stride, src0, src1); + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); + DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + ADD2(dst0_r, 
offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + /* rearranging filter */ + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + weight = weight & 0x0000FFFF; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + 
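/* Per-sample model of the HEVC_UNIW_RND_CLIP* + HEVC_PCK_* sequence (a
 * scalar sketch of what the vector macros appear to compute, given
 * <stdint.h> and assuming SRAR_W rounds by adding 1 << (rnd - 1) before
 * shifting; the helper name is illustrative only):
 *
 *     static inline uint8_t uniwgt_rnd_clip(int16_t filt, int32_t weight,
 *                                           int32_t offset, int32_t rnd)
 *     {
 *         int32_t val = (int32_t) filt * weight;         // widen and weight
 *         val = (val + (1 << (rnd - 1))) >> rnd;         // rounded shift
 *         val += offset;                                 // add offset
 *         return val < 0 ? 0 : (val > 255 ? 255 : val);  // clip to 8 bit
 *     }
 *
 * The PCK macros then repack the clipped words into bytes for the ST*
 * stores. */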
HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (4 == height) { + hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (8 == height || 16 == height) { + hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, + filter, height, weight, + offset, rnd_val); + } +} + +static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v8i16 filt0, filt1, dst0, dst1; + v16i8 src0, src1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = 
mask0 + 2; + + LD_SB2(src, src_stride, src0, src1); + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + + LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); + LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); + XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r); + + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_hz_uniwgt_4t_8x4multiple_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, 
dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (6 == height) { + hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else { + hevc_hz_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, + rnd_val); + } +} + +static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 + }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v16i8 mask3; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + mask3 = mask2 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + 
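/* Each VSHF_B2 / DPADD_SB2 pair evaluates the 4-tap filter directly:
 * mask0 gathers the (src[j], src[j+1]) byte pairs and mask1 = mask0 + 2
 * the (src[j+2], src[j+3]) pairs, while filt0/filt1 hold one pair of taps
 * each, so every halfword lane accumulates
 *
 *     dst[j] = (128 << 6) + sum_{k=0..3} filt[k] * (src[j + k] - 128)
 *
 * For this 12-wide case, mask2/mask3 below gather columns 8..11 of two
 * rows into one register (entries >= 16 select the second VSHF operand),
 * so dst4 and dst5 each carry the 4-sample tails of a pair of rows. */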
+ VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, 
dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + uint8_t *dst_tmp = dst + 16; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1; + v8i16 dst0, dst1, dst2, dst3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + /* 16 width */ + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 16, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + /* 8 width */ + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } +} + +static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + v4i32 
dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + weight = weight & 0x0000FFFF; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + dst += dst_stride; + + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, 16); + dst += dst_stride; + } +} + +static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src2110, src4332; + v8i16 dst10; + v4i32 dst0_r, dst0_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + LD_SB2(src, src_stride, src3, src4); + 
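/* The 4-wide vertical filter packs two output rows into one register:
 * ILVR_B forms (row n, row n+1) byte pairs, ILVR_D stacks two such pair
 * sets (columns 0..3 of consecutive rows) into a single vector, and each
 * DPADD_SB2_SH with the filt0/filt1 tap pairs then accumulates the 4-tap
 * result for two rows at once. */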
ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + + ILVRL_H2_SW(dst10, dst10, dst0_r, dst0_l); + DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); + SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); + ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); + dst0_r = CLIP_SW_0_255(dst0_r); + dst0_l = CLIP_SW_0_255(dst0_l); + + HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, src6554; + v8i16 dst10, dst32; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + LD_SB4(src, src_stride, src3, src4, src5, src6); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554); + XORI_B2_128_SB(src4332, src6554); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + HEVC_UNIW_RND_CLIP2(dst10, dst32, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src2110, src4332, src6554, src8776; + v8i16 dst10, dst32, dst54, dst76; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + 
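/* Only src2110 (pairs built from the last rows seen) and src2 (the newest
 * row) have to survive from one 8-row iteration to the next; a 4-tap
 * vertical filter needs three rows of history, and the loop below rebuilds
 * every other interleave from the freshly loaded rows before refreshing
 * src2110 from src98_r/src109_r. */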
src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8); + src += (6 * src_stride); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, + src4332, src6554, src8776); + XORI_B3_128_SB(src4332, src6554, src8776); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + dst54 = const_vec; + DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); + + LD_SB2(src, src_stride, src9, src2); + src += (2 * src_stride); + ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + dst76 = const_vec; + DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76); + HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + if (2 == height) { + hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (4 == height) { + hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (0 == (height % 8)) { + hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, + rnd_val); + } +} + +static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + + tmp2 = 
const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst0_l, dst1_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + HEVC_UNIW_RND_CLIP2(tmp0, tmp1, weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r; + v16i8 src21_r, src43_r, src65_r, src87_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8); + XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, 
tmp3, tmp3); + tmp4 = const_vec; + DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4); + tmp5 = const_vec; + DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_vt_uniwgt_4t_8x4multiple_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + if (2 == height) { + hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else if (6 == height) { + hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, rnd_val); + } else { + hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, + filter, height, weight, offset, + rnd_val); + } +} + +static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, 
src3, src4, src5; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; + v16i8 src2110, src4332; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src -= (1 * src_stride); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp4 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4); + + LD_SB2(src, src_stride, src5, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); + + tmp2 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3); + tmp5 = const_vec; + DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, + dst0_r, dst1_r, dst2_r); + ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + 
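/* Note on the 16-wide vertical path below: ILVR_B2_SB/ILVL_B2_SB build the
 * right (low 8 bytes) and left (high 8 bytes) byte interleaves of adjacent
 * rows, so the same 8-lane DPADD_SB2_SH dot product is applied once per
 * half-row; tmp0/tmp1 carry the low halves and tmp2/tmp3 the high halves of
 * two output rows before the common weight/round/clip step. */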
ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src, src_stride, src5, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0); + tmp1 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1); + tmp2 = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3); + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10, src11; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + LD_SB3(src + 16, src_stride, src6, src7, src8); + src += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + LD_SB2(src + 16, src_stride, src9, src10); + src += (2 * src_stride); + 
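/* Note: the 24-wide path handles the block as a 16-wide part (right and left
 * interleaves, stored with ST_SW2) plus an extra 8-wide column loaded from
 * src + 16 (the src6..src11 stream, right interleaves only) that is stored
 * with ST8x2_UB at dst + 16. */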
XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5); + tmp2 = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst4_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src, src_stride, src5, src2); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + LD_SB2(src + 16, src_stride, src11, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + + tmp0 = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5); + tmp2 = const_vec; + DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2); + tmp3 = const_vec; + DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + ST8x2_UB(dst4_r, dst + 16, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + int32_t loop_cnt; + uint8_t *dst_tmp = dst + 16; + v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16i8 src10_l, src32_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src87_l, src109_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + weight = weight & 0x0000FFFF; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, 
filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + LD_SB3(src + 16, src_stride, src6, src7, src8); + src += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + tmp0 = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0); + tmp4 = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4); + tmp1 = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1); + tmp5 = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5); + + HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r, + dst0_l, dst1_l, dst2_l, dst3_l); + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, + dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + src10_r = src32_r; + src21_r = src43_r; + src10_l = src32_l; + src21_l = src43_l; + src2 = src4; + + LD_SB2(src + 16, src_stride, src9, src10); + src += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); + + tmp2 = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2); + tmp6 = const_vec; + DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6); + tmp3 = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3); + tmp7 = const_vec; + DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7); + + HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, + weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst6_r, dst7_r, + dst4_l, dst5_l, dst6_l, dst7_l); + + HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r, + dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r); + ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + + src76_r = src98_r; + src87_r = src109_r; + src76_l = src98_l; + src87_l = src109_l; + src8 = src10; + } +} + +static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v4i32 dst0_r, dst1_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, 
src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); + SRAR_W2_SW(dst0_r, dst1_r, rnd_vec); + ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); + dst0_r = CLIP_SW_0_255(dst0_r); + dst1_r = CLIP_SW_0_255(dst1_r); + + HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r); + ST4x2_UB(dst0_r, dst, dst_stride); +} + +static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB4(src, src_stride, src3, src4, src5, src6); + XORI_B4_128_SB(src3, src4, src5, src6); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = 
HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst10_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_r >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_r >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r); + HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); +} + +static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + for (loop_cnt = height >> 3; loop_cnt--;) { + LD_SB8(src, src_stride, + src3, src4, src5, src6, src7, src8, src9, src10); + src += (8 * src_stride); + XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, 
mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + dst54_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_r >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + dst65_r = __msa_ilvr_h(dst6, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_r >>= 6; + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + dst76_r = __msa_ilvr_h(dst7, dst6); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_r >>= 6; + + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + dst87_r = __msa_ilvr_h(dst8, dst7); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_r >>= 6; + + VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); + dst9 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9); + dst10_r = __msa_ilvr_h(dst9, dst8); + dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1); + dst6_r >>= 6; + + VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + dst21_r = __msa_ilvr_h(dst2, dst9); + dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1); + dst7_r >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst2_r, dst3_r); + HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r, + weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst6_r, dst7_r); + HEVC_PCK_SW_SB4(dst5_r, dst4_r, dst7_r, dst6_r, dst0_r); + ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + if (2 == height) { + hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); + } else if (4 == height) { + hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); + } else if (0 == (height % 8)) { + hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); + } +} + +static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, 
dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l, + weight_vec, offset_vec, rnd_vec, + dst2_r, dst3_r, dst2_l, dst3_l); + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t 
height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + v4i32 weight_vec, offset_vec, rnd_vec; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); + ST8x2_UB(dst0_r, dst, dst_stride); + dst += (2 * dst_stride); +} + +static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v4i32 dst4_r, dst4_l, dst5_r, dst5_l; + v8i16 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 dst21_r, dst43_r, dst21_l, dst43_l; + v8i16 dst54_r, dst54_l, dst65_r, dst65_l; + v8i16 dst76_r, dst76_l, dst87_r, dst87_l; + v4i32 weight_vec, 
offset_vec, rnd_vec; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + XORI_B2_128_SB(src5, src6); + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src7, src8); + + /* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); + + dst4_r >>= 6; + dst4_l >>= 6; + + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); + dst5_r >>= 
6; + dst5_l >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l, + weight_vec, offset_vec, rnd_vec, + dst2_r, dst3_r, dst2_l, dst3_l); + HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l, + weight_vec, offset_vec, rnd_vec, + dst4_r, dst5_r, dst4_l, dst5_l); + HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r); + ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + dst += (4 * dst_stride); + ST8x2_UB(dst2_r, dst, dst_stride); +} + +static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v4i32 weight_vec, offset_vec, rnd_vec; + v4i32 dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + weight_vec = __msa_fill_w(weight); + offset_vec = __msa_fill_w(offset); + rnd_vec = __msa_fill_w(rnd_val); + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB3(src_tmp, src_stride, src0, src1, src2); + src_tmp += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); + src_tmp += (4 * src_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; 
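For reference, the hv kernels in this file are separable: a horizontal 4-tap pass produces 16-bit intermediates (dst0..dst6 above), and a vertical 4-tap pass runs on interleaved pairs of those intermediates with 32-bit accumulation, dropping 6 bits of headroom before the weighting step. Below is a rough scalar sketch of that flow, assuming 4-tap filters and blocks no larger than 64x64; the function name and the local buffer are illustrative only, not part of the patch.

static void hv_4tap_uniwgt_ref(const unsigned char *src, int src_stride,
                               unsigned char *dst, int dst_stride,
                               const signed char *fx, const signed char *fy,
                               int w, int h, int weight, int offset, int rnd)
{
    int mid[(64 + 3) * 64];   /* horizontal-pass intermediates, w, h <= 64 */
    const unsigned char *p = src - src_stride - 1; /* mirrors src -= (src_stride + 1) */

    /* horizontal pass: h + 3 rows are needed for the 4-tap vertical pass */
    for (int y = 0; y < h + 3; y++)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 4; k++)
                s += fx[k] * p[y * src_stride + x + k];
            mid[y * w + x] = s;
        }

    /* vertical pass, then >> 6, weight, round, offset, clip */
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 4; k++)
                s += fy[k] * mid[(y + k) * w + x];
            s >>= 6;                                    /* dstN_r >>= 6 */
            s = (s * weight + (1 << (rnd - 1))) >> rnd;
            s += offset;
            dst[y * dst_stride + x] = s < 0 ? 0 : (s > 255 ? 255 : s);
        }
}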
+ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l, + weight_vec, offset_vec, rnd_vec, + dst0_r, dst1_r, dst0_l, dst1_l); + HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l, + weight_vec, offset_vec, rnd_vec, + dst2_r, dst3_r, dst2_l, dst3_l); + HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + + if (2 == height) { + hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); + } else if (6 == height) { + hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); + } else if (0 == (height % 4)) { + hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 8); + } +} + +static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 8); + hevc_hv_uniwgt_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val); +} + +static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 16); +} + +static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 24); +} + +static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t weight, + int32_t offset, + int32_t rnd_val) +{ + hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, weight, + offset, rnd_val, 32); +} + +#define UNIWGT_MC_COPY(WIDTH) \ +void 
ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + int denom, \ + int weight, \ + int offset, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + int shift = denom + 14 - 8; \ + hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ + height, weight, offset, shift); \ +} + +UNIWGT_MC_COPY(4); +UNIWGT_MC_COPY(6); +UNIWGT_MC_COPY(8); +UNIWGT_MC_COPY(12); +UNIWGT_MC_COPY(16); +UNIWGT_MC_COPY(24); +UNIWGT_MC_COPY(32); +UNIWGT_MC_COPY(48); +UNIWGT_MC_COPY(64); + +#undef UNIWGT_MC_COPY + +#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ +void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int height, \ + int denom, \ + int weight, \ + int offset, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ + int shift = denom + 14 - 8; \ + \ + hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ + dst_stride, filter, height, \ + weight, offset, shift); \ +} + +UNI_W_MC(qpel, h, 4, 8, hz, mx); +UNI_W_MC(qpel, h, 8, 8, hz, mx); +UNI_W_MC(qpel, h, 12, 8, hz, mx); +UNI_W_MC(qpel, h, 16, 8, hz, mx); +UNI_W_MC(qpel, h, 24, 8, hz, mx); +UNI_W_MC(qpel, h, 32, 8, hz, mx); +UNI_W_MC(qpel, h, 48, 8, hz, mx); +UNI_W_MC(qpel, h, 64, 8, hz, mx); + +UNI_W_MC(qpel, v, 4, 8, vt, my); +UNI_W_MC(qpel, v, 8, 8, vt, my); +UNI_W_MC(qpel, v, 12, 8, vt, my); +UNI_W_MC(qpel, v, 16, 8, vt, my); +UNI_W_MC(qpel, v, 24, 8, vt, my); +UNI_W_MC(qpel, v, 32, 8, vt, my); +UNI_W_MC(qpel, v, 48, 8, vt, my); +UNI_W_MC(qpel, v, 64, 8, vt, my); + +UNI_W_MC(epel, h, 4, 4, hz, mx); +UNI_W_MC(epel, h, 6, 4, hz, mx); +UNI_W_MC(epel, h, 8, 4, hz, mx); +UNI_W_MC(epel, h, 12, 4, hz, mx); +UNI_W_MC(epel, h, 16, 4, hz, mx); +UNI_W_MC(epel, h, 24, 4, hz, mx); +UNI_W_MC(epel, h, 32, 4, hz, mx); + +UNI_W_MC(epel, v, 4, 4, vt, my); +UNI_W_MC(epel, v, 6, 4, vt, my); +UNI_W_MC(epel, v, 8, 4, vt, my); +UNI_W_MC(epel, v, 12, 4, vt, my); +UNI_W_MC(epel, v, 16, 4, vt, my); +UNI_W_MC(epel, v, 24, 4, vt, my); +UNI_W_MC(epel, v, 32, 4, vt, my); + +#undef UNI_W_MC + +#define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \ +void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int height, \ + int denom, \ + int weight, \ + int offset, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ + const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ + int shift = denom + 14 - 8; \ + \ + hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ + dst_stride, filter_x, \ + filter_y, height, weight, \ + offset, shift); \ +} + +UNI_W_MC_HV(qpel, hv, 4, 8, hv); +UNI_W_MC_HV(qpel, hv, 8, 8, hv); +UNI_W_MC_HV(qpel, hv, 12, 8, hv); +UNI_W_MC_HV(qpel, hv, 16, 8, hv); +UNI_W_MC_HV(qpel, hv, 24, 8, hv); +UNI_W_MC_HV(qpel, hv, 32, 8, hv); +UNI_W_MC_HV(qpel, hv, 48, 8, hv); +UNI_W_MC_HV(qpel, hv, 64, 8, hv); + +UNI_W_MC_HV(epel, hv, 4, 4, hv); +UNI_W_MC_HV(epel, hv, 6, 4, hv); +UNI_W_MC_HV(epel, hv, 8, 4, hv); +UNI_W_MC_HV(epel, hv, 12, 4, hv); +UNI_W_MC_HV(epel, hv, 16, 4, hv); +UNI_W_MC_HV(epel, hv, 24, 4, hv); +UNI_W_MC_HV(epel, hv, 32, 4, hv); + +#undef UNI_W_MC_HV diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c new file mode 100644 index 0000000000..3675b93155 
--- /dev/null +++ b/libavcodec/mips/hevcdsp_init_mips.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/mips/hevcdsp_mips.h" + +#if HAVE_MSA +static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c, + const int bit_depth) +{ + if (8 == bit_depth) { + c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa; + c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa; + c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa; + c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa; + c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa; + c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa; + c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa; + c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa; + c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa; + + c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa; + c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa; + c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa; + c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa; + c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa; + c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa; + c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa; + c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa; + + c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa; + c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa; + c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa; + c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa; + c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa; + c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa; + c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa; + c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa; + + c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa; + c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa; + c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa; + c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa; + c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa; + c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa; + c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa; + c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa; + + c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa; + c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa; + c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa; + c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa; + c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa; 
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa; + c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa; + + c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa; + c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa; + c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa; + c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa; + c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa; + c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa; + c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa; + + c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa; + c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa; + c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa; + c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa; + c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa; + c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa; + c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa; + + c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa; + c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa; + c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa; + c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa; + c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa; + c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa; + c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa; + + c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa; + c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa; + c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa; + c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa; + c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa; + c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa; + + c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa; + c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa; + c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa; + c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa; + c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa; + c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa; + c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa; + c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa; + + c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa; + c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa; + c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa; + c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa; + c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa; + c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa; + c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa; + c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa; + + c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa; + c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa; + c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa; + c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa; + c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa; + c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa; 
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa; + c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa; + + c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa; + c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa; + c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa; + c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa; + c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa; + + c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa; + c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa; + c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa; + c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa; + c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa; + c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa; + c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa; + + c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa; + c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa; + c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa; + c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa; + c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa; + c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa; + c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa; + + c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa; + c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa; + c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa; + c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa; + c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa; + c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa; + c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa; + + c->put_hevc_qpel_uni_w[1][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa; + c->put_hevc_qpel_uni_w[3][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa; + c->put_hevc_qpel_uni_w[4][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa; + c->put_hevc_qpel_uni_w[5][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa; + c->put_hevc_qpel_uni_w[6][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa; + c->put_hevc_qpel_uni_w[7][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa; + c->put_hevc_qpel_uni_w[8][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa; + c->put_hevc_qpel_uni_w[9][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa; + + c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa; + c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa; + c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa; + c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa; + c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa; + c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa; + c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa; + c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa; + + c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa; + c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa; + c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa; + c->put_hevc_qpel_uni_w[5][1][0] = 
ff_hevc_put_hevc_uni_w_qpel_v16_8_msa; + c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa; + c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa; + c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa; + c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa; + + c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa; + c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa; + c->put_hevc_qpel_uni_w[4][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa; + c->put_hevc_qpel_uni_w[5][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa; + c->put_hevc_qpel_uni_w[6][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa; + c->put_hevc_qpel_uni_w[7][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa; + c->put_hevc_qpel_uni_w[8][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa; + c->put_hevc_qpel_uni_w[9][1][1] = + ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa; + + c->put_hevc_epel_uni_w[1][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa; + c->put_hevc_epel_uni_w[2][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa; + c->put_hevc_epel_uni_w[3][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa; + c->put_hevc_epel_uni_w[4][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa; + c->put_hevc_epel_uni_w[5][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa; + c->put_hevc_epel_uni_w[6][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa; + c->put_hevc_epel_uni_w[7][0][0] = + ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa; + + c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa; + c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa; + c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa; + c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa; + c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa; + c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa; + c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa; + + c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa; + c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa; + c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa; + c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa; + c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa; + c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa; + c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa; + + c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa; + c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa; + c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa; + c->put_hevc_epel_uni_w[4][1][1] = + ff_hevc_put_hevc_uni_w_epel_hv12_8_msa; + c->put_hevc_epel_uni_w[5][1][1] = + ff_hevc_put_hevc_uni_w_epel_hv16_8_msa; + c->put_hevc_epel_uni_w[6][1][1] = + ff_hevc_put_hevc_uni_w_epel_hv24_8_msa; + c->put_hevc_epel_uni_w[7][1][1] = + ff_hevc_put_hevc_uni_w_epel_hv32_8_msa; + + c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa; + c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa; + c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa; + c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa; + c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa; + c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa; + 
c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa; + c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa; + + c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa; + c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa; + c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa; + c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa; + c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa; + c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa; + c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa; + c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa; + + c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa; + c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa; + c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa; + c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa; + c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa; + c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa; + c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa; + c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa; + + c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa; + c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa; + c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa; + c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa; + c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa; + c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa; + c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa; + c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa; + + c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa; + c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa; + c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa; + c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa; + c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa; + c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa; + c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa; + + c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa; + c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa; + c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa; + c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa; + c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa; + c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa; + c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa; + + c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa; + c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa; + c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa; + c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa; + c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa; + c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa; + c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa; + + c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa; + c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa; + c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa; + c->put_hevc_epel_bi[4][1][1] = 
ff_hevc_put_hevc_bi_epel_hv12_8_msa; + c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa; + c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa; + c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa; + + c->put_hevc_qpel_bi_w[1][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa; + c->put_hevc_qpel_bi_w[3][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa; + c->put_hevc_qpel_bi_w[4][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa; + c->put_hevc_qpel_bi_w[5][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa; + c->put_hevc_qpel_bi_w[6][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa; + c->put_hevc_qpel_bi_w[7][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa; + c->put_hevc_qpel_bi_w[8][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa; + c->put_hevc_qpel_bi_w[9][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa; + + c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa; + c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa; + c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa; + c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa; + c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa; + c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa; + c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa; + c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa; + + c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa; + c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa; + c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa; + c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa; + c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa; + c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa; + c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa; + c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa; + + c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa; + c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa; + c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa; + c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa; + c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa; + c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa; + c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa; + c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa; + + c->put_hevc_epel_bi_w[1][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa; + c->put_hevc_epel_bi_w[2][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa; + c->put_hevc_epel_bi_w[3][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa; + c->put_hevc_epel_bi_w[4][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa; + c->put_hevc_epel_bi_w[5][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa; + c->put_hevc_epel_bi_w[6][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa; + c->put_hevc_epel_bi_w[7][0][0] = + ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa; + + c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa; + c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa; + c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa; + c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa; + c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa; 
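    /* Note on the five dispatch families filled in above and below: the plain
     * put_hevc_* routines emit pre-rounding intermediates to an int16_t
     * buffer (note the << 6 scaling in the copy paths), the *_uni variants
     * round and store directly to 8-bit pixels, *_uni_w adds explicit
     * weighting (denom/weight/offset), *_bi combines the 8-bit source with a
     * second, already-filtered int16_t reference (src_16bit), and *_bi_w
     * applies per-reference weights and offsets on top of that; the exact
     * prototypes are declared in hevcdsp_mips.h below. */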
+ c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa; + c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa; + + c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa; + c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa; + c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa; + c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa; + c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa; + c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa; + c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa; + + c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa; + c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa; + c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa; + c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa; + c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa; + c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa; + c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa; + + c->sao_band_filter[0] = + c->sao_band_filter[1] = + c->sao_band_filter[2] = + c->sao_band_filter[3] = + c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa; + + c->sao_edge_filter[0] = + c->sao_edge_filter[1] = + c->sao_edge_filter[2] = + c->sao_edge_filter[3] = + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa; + + c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa; + c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa; + + c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa; + c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa; + + c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa; + c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa; + + c->hevc_h_loop_filter_chroma_c = + ff_hevc_loop_filter_chroma_h_8_msa; + c->hevc_v_loop_filter_chroma_c = + ff_hevc_loop_filter_chroma_v_8_msa; + + c->idct[0] = ff_hevc_idct_4x4_msa; + c->idct[1] = ff_hevc_idct_8x8_msa; + c->idct[2] = ff_hevc_idct_16x16_msa; + c->idct[3] = ff_hevc_idct_32x32_msa; + c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa; + c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa; + c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa; + c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa; + c->transform_add[0] = ff_hevc_addblk_4x4_msa; + c->transform_add[1] = ff_hevc_addblk_8x8_msa; + c->transform_add[2] = ff_hevc_addblk_16x16_msa; + c->transform_add[3] = ff_hevc_addblk_32x32_msa; + c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa; + } +} +#endif // #if HAVE_MSA + +void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth) +{ +#if HAVE_MSA + hevc_dsp_init_msa(c, bit_depth); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h new file mode 100644 index 0000000000..1573d1cc9d --- /dev/null +++ b/libavcodec/mips/hevcdsp_mips.h @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H +#define AVCODEC_MIPS_HEVCDSP_MIPS_H + +#include "libavcodec/hevcdsp.h" + +#define MC(PEL, DIR, WIDTH) \ +void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) + +MC(pel, pixels, 4); +MC(pel, pixels, 6); +MC(pel, pixels, 8); +MC(pel, pixels, 12); +MC(pel, pixels, 16); +MC(pel, pixels, 24); +MC(pel, pixels, 32); +MC(pel, pixels, 48); +MC(pel, pixels, 64); + +MC(qpel, h, 4); +MC(qpel, h, 8); +MC(qpel, h, 12); +MC(qpel, h, 16); +MC(qpel, h, 24); +MC(qpel, h, 32); +MC(qpel, h, 48); +MC(qpel, h, 64); + +MC(qpel, v, 4); +MC(qpel, v, 8); +MC(qpel, v, 12); +MC(qpel, v, 16); +MC(qpel, v, 24); +MC(qpel, v, 32); +MC(qpel, v, 48); +MC(qpel, v, 64); + +MC(qpel, hv, 4); +MC(qpel, hv, 8); +MC(qpel, hv, 12); +MC(qpel, hv, 16); +MC(qpel, hv, 24); +MC(qpel, hv, 32); +MC(qpel, hv, 48); +MC(qpel, hv, 64); + +MC(epel, h, 4); +MC(epel, h, 6); +MC(epel, h, 8); +MC(epel, h, 12); +MC(epel, h, 16); +MC(epel, h, 24); +MC(epel, h, 32); +MC(epel, h, 48); +MC(epel, h, 64); + +MC(epel, v, 4); +MC(epel, v, 6); +MC(epel, v, 8); +MC(epel, v, 12); +MC(epel, v, 16); +MC(epel, v, 24); +MC(epel, v, 32); +MC(epel, v, 48); +MC(epel, v, 64); + +MC(epel, hv, 4); +MC(epel, hv, 6); +MC(epel, hv, 8); +MC(epel, hv, 12); +MC(epel, hv, 16); +MC(epel, hv, 24); +MC(epel, hv, 32); +MC(epel, hv, 48); +MC(epel, hv, 64); + +#undef MC + +#define UNI_MC(PEL, DIR, WIDTH) \ +void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) + +UNI_MC(pel, pixels, 4); +UNI_MC(pel, pixels, 6); +UNI_MC(pel, pixels, 8); +UNI_MC(pel, pixels, 12); +UNI_MC(pel, pixels, 16); +UNI_MC(pel, pixels, 24); +UNI_MC(pel, pixels, 32); +UNI_MC(pel, pixels, 48); +UNI_MC(pel, pixels, 64); + +UNI_MC(qpel, h, 4); +UNI_MC(qpel, h, 8); +UNI_MC(qpel, h, 12); +UNI_MC(qpel, h, 16); +UNI_MC(qpel, h, 24); +UNI_MC(qpel, h, 32); +UNI_MC(qpel, h, 48); +UNI_MC(qpel, h, 64); + +UNI_MC(qpel, v, 4); +UNI_MC(qpel, v, 8); +UNI_MC(qpel, v, 12); +UNI_MC(qpel, v, 16); +UNI_MC(qpel, v, 24); +UNI_MC(qpel, v, 32); +UNI_MC(qpel, v, 48); +UNI_MC(qpel, v, 64); + +UNI_MC(qpel, hv, 4); +UNI_MC(qpel, hv, 8); +UNI_MC(qpel, hv, 12); +UNI_MC(qpel, hv, 16); +UNI_MC(qpel, hv, 24); +UNI_MC(qpel, hv, 32); +UNI_MC(qpel, hv, 48); +UNI_MC(qpel, hv, 64); + +UNI_MC(epel, h, 4); +UNI_MC(epel, h, 6); +UNI_MC(epel, h, 8); +UNI_MC(epel, h, 12); +UNI_MC(epel, h, 16); +UNI_MC(epel, h, 24); +UNI_MC(epel, h, 32); +UNI_MC(epel, h, 48); +UNI_MC(epel, h, 64); + +UNI_MC(epel, v, 4); +UNI_MC(epel, v, 6); +UNI_MC(epel, v, 8); +UNI_MC(epel, v, 12); +UNI_MC(epel, v, 16); +UNI_MC(epel, v, 24); +UNI_MC(epel, v, 32); +UNI_MC(epel, v, 48); +UNI_MC(epel, v, 64); + +UNI_MC(epel, hv, 4); +UNI_MC(epel, hv, 6); +UNI_MC(epel, hv, 8); +UNI_MC(epel, hv, 12); +UNI_MC(epel, hv, 16); +UNI_MC(epel, hv, 24); +UNI_MC(epel, hv, 32); +UNI_MC(epel, hv, 48); +UNI_MC(epel, hv, 64); + 
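The MC/UNI_MC/UNI_W_MC/BI_MC/BI_W_MC blocks in this header are parameterized declaration macros: each invocation expands to one extern prototype whose name encodes the filter family, direction and width. For instance, UNI_MC(qpel, h, 4) above expands to:

void ff_hevc_put_hevc_uni_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                        uint8_t *src, ptrdiff_t src_stride,
                                        int height, intptr_t mx, intptr_t my,
                                        int width);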
+#undef UNI_MC + +#define UNI_W_MC(PEL, DIR, WIDTH) \ +void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ + uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int height, \ + int denom, \ + int weight, \ + int offset, \ + intptr_t mx, \ + intptr_t my, \ + int width) + +UNI_W_MC(pel, pixels, 4); +UNI_W_MC(pel, pixels, 6); +UNI_W_MC(pel, pixels, 8); +UNI_W_MC(pel, pixels, 12); +UNI_W_MC(pel, pixels, 16); +UNI_W_MC(pel, pixels, 24); +UNI_W_MC(pel, pixels, 32); +UNI_W_MC(pel, pixels, 48); +UNI_W_MC(pel, pixels, 64); + +UNI_W_MC(qpel, h, 4); +UNI_W_MC(qpel, h, 8); +UNI_W_MC(qpel, h, 12); +UNI_W_MC(qpel, h, 16); +UNI_W_MC(qpel, h, 24); +UNI_W_MC(qpel, h, 32); +UNI_W_MC(qpel, h, 48); +UNI_W_MC(qpel, h, 64); + +UNI_W_MC(qpel, v, 4); +UNI_W_MC(qpel, v, 8); +UNI_W_MC(qpel, v, 12); +UNI_W_MC(qpel, v, 16); +UNI_W_MC(qpel, v, 24); +UNI_W_MC(qpel, v, 32); +UNI_W_MC(qpel, v, 48); +UNI_W_MC(qpel, v, 64); + +UNI_W_MC(qpel, hv, 4); +UNI_W_MC(qpel, hv, 8); +UNI_W_MC(qpel, hv, 12); +UNI_W_MC(qpel, hv, 16); +UNI_W_MC(qpel, hv, 24); +UNI_W_MC(qpel, hv, 32); +UNI_W_MC(qpel, hv, 48); +UNI_W_MC(qpel, hv, 64); + +UNI_W_MC(epel, h, 4); +UNI_W_MC(epel, h, 6); +UNI_W_MC(epel, h, 8); +UNI_W_MC(epel, h, 12); +UNI_W_MC(epel, h, 16); +UNI_W_MC(epel, h, 24); +UNI_W_MC(epel, h, 32); +UNI_W_MC(epel, h, 48); +UNI_W_MC(epel, h, 64); + +UNI_W_MC(epel, v, 4); +UNI_W_MC(epel, v, 6); +UNI_W_MC(epel, v, 8); +UNI_W_MC(epel, v, 12); +UNI_W_MC(epel, v, 16); +UNI_W_MC(epel, v, 24); +UNI_W_MC(epel, v, 32); +UNI_W_MC(epel, v, 48); +UNI_W_MC(epel, v, 64); + +UNI_W_MC(epel, hv, 4); +UNI_W_MC(epel, hv, 6); +UNI_W_MC(epel, hv, 8); +UNI_W_MC(epel, hv, 12); +UNI_W_MC(epel, hv, 16); +UNI_W_MC(epel, hv, 24); +UNI_W_MC(epel, hv, 32); +UNI_W_MC(epel, hv, 48); +UNI_W_MC(epel, hv, 64); + +#undef UNI_W_MC + +#define BI_MC(PEL, DIR, WIDTH) \ +void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t dst_stride, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int16_t *src_16bit, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) + +BI_MC(pel, pixels, 4); +BI_MC(pel, pixels, 6); +BI_MC(pel, pixels, 8); +BI_MC(pel, pixels, 12); +BI_MC(pel, pixels, 16); +BI_MC(pel, pixels, 24); +BI_MC(pel, pixels, 32); +BI_MC(pel, pixels, 48); +BI_MC(pel, pixels, 64); + +BI_MC(qpel, h, 4); +BI_MC(qpel, h, 8); +BI_MC(qpel, h, 12); +BI_MC(qpel, h, 16); +BI_MC(qpel, h, 24); +BI_MC(qpel, h, 32); +BI_MC(qpel, h, 48); +BI_MC(qpel, h, 64); + +BI_MC(qpel, v, 4); +BI_MC(qpel, v, 8); +BI_MC(qpel, v, 12); +BI_MC(qpel, v, 16); +BI_MC(qpel, v, 24); +BI_MC(qpel, v, 32); +BI_MC(qpel, v, 48); +BI_MC(qpel, v, 64); + +BI_MC(qpel, hv, 4); +BI_MC(qpel, hv, 8); +BI_MC(qpel, hv, 12); +BI_MC(qpel, hv, 16); +BI_MC(qpel, hv, 24); +BI_MC(qpel, hv, 32); +BI_MC(qpel, hv, 48); +BI_MC(qpel, hv, 64); + +BI_MC(epel, h, 4); +BI_MC(epel, h, 6); +BI_MC(epel, h, 8); +BI_MC(epel, h, 12); +BI_MC(epel, h, 16); +BI_MC(epel, h, 24); +BI_MC(epel, h, 32); +BI_MC(epel, h, 48); +BI_MC(epel, h, 64); + +BI_MC(epel, v, 4); +BI_MC(epel, v, 6); +BI_MC(epel, v, 8); +BI_MC(epel, v, 12); +BI_MC(epel, v, 16); +BI_MC(epel, v, 24); +BI_MC(epel, v, 32); +BI_MC(epel, v, 48); +BI_MC(epel, v, 64); + +BI_MC(epel, hv, 4); +BI_MC(epel, hv, 6); +BI_MC(epel, hv, 8); +BI_MC(epel, hv, 12); +BI_MC(epel, hv, 16); +BI_MC(epel, hv, 24); +BI_MC(epel, hv, 32); +BI_MC(epel, hv, 48); +BI_MC(epel, hv, 64); + +#undef BI_MC + +#define BI_W_MC(PEL, DIR, WIDTH) \ +void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ + ptrdiff_t \ + dst_stride, \ 
+ uint8_t *src, \ + ptrdiff_t \ + src_stride, \ + int16_t *src_16bit, \ + int height, \ + int denom, \ + int weight0, \ + int weight1, \ + int offset0, \ + int offset1, \ + intptr_t mx, \ + intptr_t my, \ + int width) + +BI_W_MC(pel, pixels, 4); +BI_W_MC(pel, pixels, 6); +BI_W_MC(pel, pixels, 8); +BI_W_MC(pel, pixels, 12); +BI_W_MC(pel, pixels, 16); +BI_W_MC(pel, pixels, 24); +BI_W_MC(pel, pixels, 32); +BI_W_MC(pel, pixels, 48); +BI_W_MC(pel, pixels, 64); + +BI_W_MC(qpel, h, 4); +BI_W_MC(qpel, h, 8); +BI_W_MC(qpel, h, 12); +BI_W_MC(qpel, h, 16); +BI_W_MC(qpel, h, 24); +BI_W_MC(qpel, h, 32); +BI_W_MC(qpel, h, 48); +BI_W_MC(qpel, h, 64); + +BI_W_MC(qpel, v, 4); +BI_W_MC(qpel, v, 8); +BI_W_MC(qpel, v, 12); +BI_W_MC(qpel, v, 16); +BI_W_MC(qpel, v, 24); +BI_W_MC(qpel, v, 32); +BI_W_MC(qpel, v, 48); +BI_W_MC(qpel, v, 64); + +BI_W_MC(qpel, hv, 4); +BI_W_MC(qpel, hv, 8); +BI_W_MC(qpel, hv, 12); +BI_W_MC(qpel, hv, 16); +BI_W_MC(qpel, hv, 24); +BI_W_MC(qpel, hv, 32); +BI_W_MC(qpel, hv, 48); +BI_W_MC(qpel, hv, 64); + +BI_W_MC(epel, h, 4); +BI_W_MC(epel, h, 6); +BI_W_MC(epel, h, 8); +BI_W_MC(epel, h, 12); +BI_W_MC(epel, h, 16); +BI_W_MC(epel, h, 24); +BI_W_MC(epel, h, 32); +BI_W_MC(epel, h, 48); +BI_W_MC(epel, h, 64); + +BI_W_MC(epel, v, 4); +BI_W_MC(epel, v, 6); +BI_W_MC(epel, v, 8); +BI_W_MC(epel, v, 12); +BI_W_MC(epel, v, 16); +BI_W_MC(epel, v, 24); +BI_W_MC(epel, v, 32); +BI_W_MC(epel, v, 48); +BI_W_MC(epel, v, 64); + +BI_W_MC(epel, hv, 4); +BI_W_MC(epel, hv, 6); +BI_W_MC(epel, hv, 8); +BI_W_MC(epel, hv, 12); +BI_W_MC(epel, hv, 16); +BI_W_MC(epel, hv, 24); +BI_W_MC(epel, hv, 32); +BI_W_MC(epel, hv, 48); +BI_W_MC(epel, hv, 64); + +#undef BI_W_MC + +void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t beta, int32_t *tc, + uint8_t *no_p, uint8_t *no_q); + +void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t beta, int32_t *tc, + uint8_t *no_p, uint8_t *no_q); + +void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q); + +void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src, + ptrdiff_t src_stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q); + +void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, + int width, int height); + +void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, + ptrdiff_t stride_dst, + int16_t *sao_offset_val, + int eo, int width, int height); + +void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit); +void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit); +void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit); +void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit); +void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs); +void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs); +void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs); +void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs); +void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *pi16Coeffs, + ptrdiff_t stride); +void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *pi16Coeffs, + ptrdiff_t stride); +void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *pi16Coeffs, + ptrdiff_t stride); +void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *pi16Coeffs, + ptrdiff_t stride); +void ff_hevc_idct_luma_4x4_msa(int16_t *pi16Coeffs); + +#endif // #ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c new file mode 100644 index 
0000000000..f2bc748e37 --- /dev/null +++ b/libavcodec/mips/hevcdsp_msa.c @@ -0,0 +1,3878 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hevcdsp_mips.h" +#include "libavcodec/mips/hevc_macros_msa.h" + +static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + v16i8 zero = { 0 }; + + if (2 == height) { + v16i8 src0, src1; + v8i16 in0; + + LD_SB2(src, src_stride, src0, src1); + + src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); + in0 = (v8i16) __msa_ilvr_b(zero, src0); + in0 <<= 6; + ST8x2_UB(in0, dst, 2 * dst_stride); + } else if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 in0, in1; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + + ILVR_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, in0, in1); + in0 <<= 6; + in1 <<= 6; + ST8x4_UB(in0, in1, dst, 2 * dst_stride); + } else if (0 == height % 8) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3; + uint32_t loop_cnt; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, + src0, src1, src2, src3); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0, in1, in2, in3); + SLLI_4V(in0, in1, in2, in3, 6); + ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } + } +} + +static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0, in1, in2, in3); + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 6); + SLLI_4V(in4, in5, in6, in7, 6); + ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + v16i8 zero = { 0 }; + + if (2 == height) { + v16i8 src0, src1; + v8i16 in0, in1; + + LD_SB2(src, src_stride, src0, src1); + + ILVR_B2_SH(zero, src0, zero, src1, in0, in1); + in0 <<= 6; + in1 <<= 6; + ST_SH2(in0, in1, dst, dst_stride); + } else if (4 == height) { + v16i8 src0, src1, 
src2, src3; + v8i16 in0, in1, in2, in3; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0, in1, in2, in3); + SLLI_4V(in0, in1, in2, in3, 6); + ST_SH4(in0, in1, in2, in3, dst, dst_stride); + } else if (6 == height) { + v16i8 src0, src1, src2, src3, src4, src5; + v8i16 in0, in1, in2, in3, in4, in5; + + LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0, in1, in2, in3); + ILVR_B2_SH(zero, src4, zero, src5, in4, in5); + SLLI_4V(in0, in1, in2, in3, 6); + in4 <<= 6; + in5 <<= 6; + ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride); + } else if (0 == height % 8) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0, in1, in2, in3); + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 6); + SLLI_4V(in4, in5, in6, in7, 6); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride); + dst += (8 * dst_stride); + } + } +} + +static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_r, in1_r, in2_r, in3_r); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + ILVL_W2_SB(src1, src0, src3, src2, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, in0, in1); + in0 <<= 6; + in1 <<= 6; + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride); + dst += (4 * dst_stride); + + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in0_r, in1_r, in2_r, in3_r); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + ILVL_W2_SB(src5, src4, src7, src6, src0, src1); + ILVR_B2_SH(zero, src0, zero, src1, in0, in1); + in0 <<= 6; + in1 <<= 6; + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_copy_16multx8mult_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + int32_t height, + int32_t width) +{ + uint8_t *src_tmp; + int16_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 in0_r, in1_r, in2_r, in3_r; + v8i16 in0_l, in1_l, in2_l, in3_l; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride); + dst_tmp 
+= (4 * dst_stride); + + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride); + dst_tmp += (4 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + v16i8 zero = { 0 }; + + if (4 == height) { + v16i8 src0, src1, src2, src3; + v8i16 in0_r, in1_r, in2_r, in3_r; + v8i16 in0_l, in1_l, in2_l, in3_l; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); + } else if (12 == height) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 src8, src9, src10, src11; + v8i16 in0_r, in1_r, in2_r, in3_r; + v8i16 in0_l, in1_l, in2_l, in3_l; + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_SB4(src, src_stride, src8, src9, src10, src11); + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); + dst += (4 * dst_stride); + + ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); + dst += (4 * dst_stride); + + ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, + in0_r, in1_r, in2_r, in3_r); + ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, + in0_l, in1_l, in2_l, in3_l); + SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); + SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); + ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); + ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); + } else if (0 == (height % 8)) { + hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, + height, 16); + } +} + +static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + hevc_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride, height); +} + +static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); +} + +static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 
48); +} + +static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, 
filt2, filt3, + dst3, dst3, dst3, dst3); + + ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height); + hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height); +} + +static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst6 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src 
-= 3; + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + ST_SH2(dst0, dst1, dst, 8); + ST_SH(dst2, dst + 16); + dst += dst_stride; + ST_SH2(dst3, dst4, dst, 8); + ST_SH(dst5, dst + 16); + dst += dst_stride; + } +} + +static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + ST_SH4(dst0, 
dst1, dst2, dst3, dst, 8); + dst += dst_stride; + } +} + +static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB3(src, 16, src0, src1, src2); + src3 = LD_SB(src + 40); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + + ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8); + dst += dst_stride; + } +} + +static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1, filt2, filt3; + v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v16i8 vec0, vec1, vec2, vec3; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= 3; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = mask0 + 8; + mask5 = mask0 + 10; + mask6 = mask0 + 12; + mask7 = mask0 + 14; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src4 = LD_SB(src + 56); + src += src_stride; + XORI_B5_128_SB(src0, src1, src2, src3, src4); + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + ST_SH(dst0, dst); + + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst1 = 
const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + ST_SH(dst1, dst + 8); + + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst2 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + ST_SH(dst2, dst + 16); + + VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst3 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + ST_SH(dst3, dst + 24); + + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + ST_SH(dst4, dst + 32); + + VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, + vec0, vec1, vec2, vec3); + dst5 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + ST_SH(dst5, dst + 40); + + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst6 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + ST_SH(dst6, dst + 48); + + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + ST_SH(dst7, dst + 56); + dst += dst_stride; + } +} + +static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src9, src10, src11, src12, src13, src14; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src1110_r, src1211_r, src1312_r, src1413_r; + v16i8 src2110, src4332, src6554, src8776, src10998; + v16i8 src12111110, src14131312; + v8i16 dst10, dst32, dst54, dst76; + v8i16 filt0, filt1, filt2, filt3; + v8i16 filter_vec, const_vec; + + src -= (3 * src_stride); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src2110, src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, + src7, src8, src9, src10, src11, src12, src13, src14); + src += (8 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, + src1110_r, src1211_r, src1312_r, src1413_r); + ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, + src1211_r, src1110_r, src1413_r, src1312_r, + src8776, src10998, src12111110, src14131312); + XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); + + dst10 = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10); + dst32 = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32); + dst54 = 
const_vec; + DPADD_SB4_SH(src6554, src8776, src10998, src12111110, + filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); + dst76 = const_vec; + DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, + filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); + + ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride); + dst += (8 * dst_stride); + + src2110 = src10998; + src4332 = src12111110; + src6554 = src14131312; + src6 = src14; + } +} + +static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 filter_vec, const_vec; + v8i16 filt0, filt1, filt2, filt3; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, + dst2_r, dst2_r, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, + dst3_r, dst3_r, dst3_r, dst3_r); + + ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 src2110, src4332, src6554, src8776, src10998; + v8i16 dst0_l, dst1_l; + v8i16 filter_vec, const_vec; + v8i16 filt0, filt1, filt2, filt3; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + 
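Throughout these kernels the source bytes are XORed with 128 (XORI_B*_128_SB) so they can feed signed-byte dot products, and every accumulator is seeded with const_vec = 128 << 6, which is exactly the bias the XOR removed because the HEVC interpolation filter taps sum to 64. A minimal scalar sketch of that identity, with illustrative names that are not part of the patch:

    #include <stdint.h>

    /* One output sample, computed the way the MSA code does it: seed the
     * accumulator with 128 << 6 and multiply (src - 128) as signed bytes.
     * Since the HEVC filter taps sum to 64, the result equals filtering
     * the unsigned source directly. */
    static int16_t filter_8tap_biased(const uint8_t *src, const int8_t *filt)
    {
        int32_t acc = 128 << 6;                       /* const_vec: 128 * sum(taps) */
        for (int k = 0; k < 8; k++)
            acc += (int8_t)(src[k] ^ 0x80) * filt[k]; /* XORI_B..._128 + DPADD_SB */
        return (int16_t)acc;
    }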
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, + src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_l, src87_l, src98_l, src109_l); + ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, + dst2_r, dst2_r, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, + dst3_r, dst3_r, dst3_r, dst3_r); + dst0_l = const_vec; + DPADD_SB4_SH(src2110, src4332, src6554, src8776, + filt0, filt1, filt2, filt3, + dst0_l, dst0_l, dst0_l, dst0_l); + dst1_l = const_vec; + DPADD_SB4_SH(src4332, src6554, src8776, src10998, + filt0, filt1, filt2, filt3, + dst1_l, dst1_l, dst1_l, dst1_l); + + ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); + ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void hevc_vt_8t_16multx4mult_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t width) +{ + uint8_t *src_tmp; + int16_t *dst_tmp; + int32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; + v8i16 dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, const_vec; + v8i16 filt0, filt1, filt2, filt3; + + src -= (3 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = width >> 4; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_r, src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, + src10_l, src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, 
src6, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, + src76_l, src87_l, src98_l, src109_l); + + dst0_r = const_vec; + DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3, + dst0_r, dst0_r, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3, + dst1_r, dst1_r, dst1_r, dst1_r); + dst2_r = const_vec; + DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3, + dst2_r, dst2_r, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3, + dst3_r, dst3_r, dst3_r, dst3_r); + dst0_l = const_vec; + DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3, + dst0_l, dst0_l, dst0_l, dst0_l); + dst1_l = const_vec; + DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3, + dst1_l, dst1_l, dst1_l, dst1_l); + dst2_l = const_vec; + DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l, + filt0, filt1, filt2, filt3, + dst2_l, dst2_l, dst2_l, dst2_l); + dst3_l = const_vec; + DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l, + filt0, filt1, filt2, filt3, + dst3_l, dst3_l, dst3_l, dst3_l); + + ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); + ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); + hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, + filter, height); +} + +static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, + filter, height, 48); +} + +static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + 
v8i16 dst30, dst41, dst52, dst63, dst66, dst87; + v4i32 dst0_r, dst1_r; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v16i8 mask0 = { + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 + }; + v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + + src -= ((3 * src_stride) + 3); + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst30 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst30, dst30, dst30, dst30); + dst41 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst41, dst41, dst41, dst41); + dst52 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst52, dst52, dst52, dst52); + dst63 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst63, dst63, dst63, dst63); + + ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52, + dst10_r, dst21_r, dst32_r); + dst43_r = __msa_ilvl_h(dst41, dst30); + dst54_r = __msa_ilvl_h(dst52, dst41); + dst65_r = __msa_ilvl_h(dst63, dst52); + dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src7, src8); + + VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst87 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst87, dst87, dst87, dst87); + dst76_r = __msa_ilvr_h(dst87, dst66); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87); + dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst1_r >>= 6; + + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); + ST8x2_UB(dst0_r, dst, (2 * dst_stride)); + dst += (2 * dst_stride); + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); + } +} + +static void hevc_hv_8t_8multx2mult_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + int16_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1, filt2, filt3; + v4i32 filt_h0, filt_h1, filt_h2, filt_h3; + v16i8 mask1, mask2, mask3; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, 
vec15; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst10_l, dst32_l, dst54_l, dst76_l; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + v8i16 dst21_l, dst43_l, dst65_l, dst87_l; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + + src -= ((3 * src_stride) + 3); + filter_vec = LD_SH(filter_x); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + + /* row 0 row 1 row 2 row 3 */ + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec12, vec13, vec14, vec15); + dst0 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst0, dst0, dst0, dst0); + dst1 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst1, dst1, dst1, dst1); + dst2 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst2, dst2, dst2, dst2); + dst3 = const_vec; + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, + dst3, dst3, dst3, dst3); + + /* row 4 row 5 row 6 */ + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, + vec4, vec5, vec6, vec7); + VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, + vec8, vec9, vec10, vec11); + dst4 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst4, dst4, dst4, dst4); + dst5 = const_vec; + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, + dst5, dst5, dst5, dst5); + dst6 = const_vec; + DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, + dst6, dst6, dst6, dst6); + + ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_r, dst32_r, dst54_r, dst21_r); + ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); + ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, + dst10_l, dst32_l, dst54_l, dst21_l); + ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src_tmp, src_stride, src7, src8); + XORI_B2_128_SB(src7, src8); + src_tmp += 2 * src_stride; + + VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst7 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst7, dst7, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst0_l >>= 6; + + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + ST_SW(dst0_r, dst_tmp); + dst_tmp += dst_stride; + + VSHF_B4_SB(src8, 
src8, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst8 = const_vec; + DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, + dst8, dst8, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst6 = dst8; + dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, + filt_h0, filt_h1, filt_h2, filt_h3); + dst0_r >>= 6; + dst0_l >>= 6; + + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + ST_SW(dst0_r, dst_tmp); + dst_tmp += dst_stride; + + dst10_r = dst32_r; + dst32_r = dst54_r; + dst54_r = dst76_r; + dst10_l = dst32_l; + dst32_l = dst54_l; + dst54_l = dst76_l; + dst21_r = dst43_r; + dst43_r = dst65_r; + dst65_r = dst87_r; + dst21_l = dst43_l; + dst43_l = dst65_l; + dst65_l = dst87_l; + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); +} + +static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + + hevc_hv_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height); +} + +static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 16); +} + +static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 24); +} + +static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 32); +} + +static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 48); +} + +static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + const int8_t *filter_x, const int8_t *filter_y, + int32_t height) +{ + hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 64); +} + +static void hevc_hz_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v8i16 filt0, filt1; + v16i8 src0, src1; + v16i8 mask1, vec0, vec1; + v8i16 dst0; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB2(src, src_stride, src0, src1); + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = 
const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + ST8x2_UB(dst0, dst, 2 * dst_stride); +} + +static void hevc_hz_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + ST8x4_UB(dst0, dst1, dst, 2 * dst_stride); +} + +static void hevc_hz_4t_4x8multiple_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask1, vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hz_4t_4w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); + } else if (4 == height) { + hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (0 == height % 8) { + hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void hevc_hz_4t_6w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + uint64_t dst_val0, dst_val1, dst_val2, dst_val3; + uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; + v8i16 filt0, filt1, dst0, dst1, dst2, dst3; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, 
filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); + dst_val2 = __msa_copy_u_d((v2i64) dst2, 0); + dst_val3 = __msa_copy_u_d((v2i64) dst3, 0); + + dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2); + dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2); + dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2); + dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2); + + SD(dst_val0, dst); + SW(dst_val_int0, dst + 4); + dst += dst_stride; + SD(dst_val1, dst); + SW(dst_val_int1, dst + 4); + dst += dst_stride; + SD(dst_val2, dst); + SW(dst_val_int2, dst + 4); + dst += dst_stride; + SD(dst_val3, dst); + SW(dst_val_int3, dst + 4); + dst += dst_stride; + } +} + +static void hevc_hz_4t_8x2multiple_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1, dst0, dst1; + v16i8 src0, src1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + XORI_B2_128_SB(src0, src1); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + ST_SH2(dst0, dst1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_hz_4t_8x4multiple_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, 
dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_4t_8w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height || 6 == height) { + hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride, + filter, height); + } else { + hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void hevc_hz_4t_12w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v8i16 filt0, filt1; + v16i8 src0, src1, src2, src3; + v16i8 mask1; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v8i16 filter_vec, const_vec; + v16i8 mask3; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask2 = { + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 + }; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask3 = mask2 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + XORI_B4_128_SB(src0, src1, src2, src3); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); + ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_4t_16w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3; + v16i8 src4, src5, src6, src7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, 
vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride); + dst += (4 * dst_stride); + } +} + +static void hevc_hz_4t_24w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + int16_t *dst_tmp = dst + 16; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask00, mask11; + v16i8 vec0, vec1; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + mask00 = mask0 + 8; + mask11 = mask0 + 10; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + /* 16 width */ + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH2(dst0, dst1, dst, 8); + dst += dst_stride; + ST_SH2(dst2, dst3, dst, 8); + dst += dst_stride; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH2(dst0, dst1, dst, 8); + dst += dst_stride; + ST_SH2(dst2, dst3, dst, 8); + dst += dst_stride; + + /* 8 width */ + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst0 = const_vec; + 
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + } +} + +static void hevc_hz_4t_32w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2; + v8i16 filt0, filt1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1, mask2, mask3; + v8i16 dst0, dst1, dst2, dst3; + v16i8 vec0, vec1; + v8i16 filter_vec, const_vec; + + src -= 1; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + mask1 = mask0 + 2; + mask2 = mask0 + 8; + mask3 = mask0 + 10; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH4(dst0, dst1, dst2, dst3, dst, 8); + dst += dst_stride; + + LD_SB2(src, 16, src0, src1); + src2 = LD_SB(src + 24); + src += src_stride; + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + + VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); + dst1 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); + + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ST_SH4(dst0, dst1, dst2, dst3, dst, 8); + dst += dst_stride; + } +} + +static void hevc_vt_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src2110, src4332; + v8i16 dst10; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + + 
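These put-style helpers write un-normalised 16-bit intermediates: each stored value is the plain dot product of source pixels and filter taps (plus the folded-in bias), with no >> 6 and no clipping at this stage; only the hv kernels shift by 6 between their horizontal and vertical passes. A scalar sketch of the 4-tap vertical case handled here, with illustrative names:

    #include <stdint.h>

    /* Scalar view of one hevc_vt_4t_* block: a 4-tap filter is applied down
     * each column and the raw sum is stored as int16_t; rounding, shifting
     * and clipping are left to the later prediction stage. */
    static void vt_4tap_ref(const uint8_t *src, int src_stride,
                            int16_t *dst, int dst_stride,
                            const int8_t *filt, int width, int height)
    {
        src -= src_stride;                        /* one row of top context */
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++) {
                int sum = 0;
                for (int k = 0; k < 4; k++)
                    sum += src[(y + k) * src_stride + x] * filt[k];
                dst[y * dst_stride + x] = (int16_t)sum;
            }
    }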
ST8x2_UB(dst10, dst, 2 * dst_stride); +} + +static void hevc_vt_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, src6554; + v8i16 dst10, dst32; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src2110, src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + + ST8x4_UB(dst10, dst32, dst, 2 * dst_stride); +} + +static void hevc_vt_4t_4x8multiple_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; + v16i8 src2110, src4332, src6554, src8776; + v8i16 dst10, dst32, dst54, dst76; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8); + src += (6 * src_stride); + + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, + src4332, src6554, src8776); + XORI_B3_128_SB(src4332, src6554, src8776); + + dst10 = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); + dst32 = const_vec; + DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); + dst54 = const_vec; + DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); + + LD_SB2(src, src_stride, src9, src2); + src += (2 * src_stride); + ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + dst76 = const_vec; + DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76); + + ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_vt_4t_4w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); + } else if (4 == height) { + hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height); + } else if (0 == (height % 8)) { + 
hevc_vt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void hevc_vt_4t_6w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; + uint64_t dst_val0, dst_val1, dst_val2, dst_val3; + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); + + dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0); + dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0); + dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0); + + dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2); + dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2); + dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2); + dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2); + + SD(dst_val0, dst); + SW(dst_val_int0, dst + 4); + dst += dst_stride; + SD(dst_val1, dst); + SW(dst_val_int1, dst + 4); + dst += dst_stride; + SD(dst_val2, dst); + SW(dst_val_int2, dst + 4); + dst += dst_stride; + SD(dst_val3, dst); + SW(dst_val_int3, dst + 4); + dst += dst_stride; + } +} + +static void hevc_vt_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); +} + +static void hevc_vt_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + 
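In the 6-sample-wide kernels above (hevc_hz_4t_6w_msa, hevc_vt_4t_6w_msa) each output row is six int16 values, i.e. 12 bytes, so the vector result is written back as one 64-bit copy plus one 32-bit copy: SD of the low doubleword covers samples 0..3 and SW of word 2 covers samples 4..5. A portable scalar sketch of that store pattern, with illustrative names:

    #include <stdint.h>
    #include <string.h>

    /* Store six 16-bit results (12 bytes) out of an 8 x int16 vector image:
     * 8 bytes for the first four samples, then 4 bytes for samples 4..5,
     * mirroring the SD + SW pair used by the 6-wide kernels. */
    static void store_6x16bit(const int16_t vec[8], int16_t *dst)
    {
        memcpy(dst,     vec,     8);   /* SD: dst[0..3] */
        memcpy(dst + 4, vec + 4, 4);   /* SW: dst[4..5] */
    }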
const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src1, src2); + + ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); +} + +static void hevc_vt_4t_8x4multiple_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + + LD_SB2(src, src_stride, src5, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + + ST_SH2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void hevc_vt_4t_8w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (2 == height) { + hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter); + } else if (6 == height) { + hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter); + } else { + hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void hevc_vt_4t_12w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t 
loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; + v16i8 src2110, src4332; + v8i16 dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= (1 * src_stride); + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst0_l = const_vec; + DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l); + + LD_SB2(src, src_stride, src5, src2); + src += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); + dst2_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); + dst1_l = const_vec; + DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l); + + ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); + ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride)); + dst += (4 * dst_stride); + } +} + +static void hevc_vt_4t_16w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src21_r, src43_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 dst0_r, dst1_r, dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + ST_SH2(dst0_r, dst0_l, dst, 8); + dst += dst_stride; + ST_SH2(dst1_r, dst1_l, dst, 8); + dst += dst_stride; + + LD_SB2(src, src_stride, src5, src2); + 
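The vertical loops load only the new source rows each iteration (the LD_SB2 above fetches two); the interleaved row pairs built from older rows stay in registers and are rotated at the bottom of the loop, so each pair of output rows costs two loads instead of re-reading the whole filter window. A sliding-window sketch of the same idea in scalar C, illustrative only:

    #include <stdint.h>

    /* Keep the last three source rows live and bring in one new row per
     * output row; pointers rotate, no data is copied. */
    static void vt_4tap_sliding(const uint8_t *src, int src_stride,
                                int16_t *dst, int dst_stride,
                                const int8_t *filt, int width, int height)
    {
        const uint8_t *row[4];
        for (int k = 0; k < 3; k++)                 /* prime the 3-row history */
            row[k] = src + (k - 1) * src_stride;
        for (int y = 0; y < height; y++) {
            row[3] = src + (y + 2) * src_stride;    /* the only new row */
            for (int x = 0; x < width; x++)
                dst[y * dst_stride + x] = (int16_t)(row[0][x] * filt[0] +
                                                    row[1][x] * filt[1] +
                                                    row[2][x] * filt[2] +
                                                    row[3][x] * filt[3]);
            row[0] = row[1]; row[1] = row[2]; row[2] = row[3];   /* rotate */
        }
    }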
src += (2 * src_stride); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); + ST_SH2(dst0_r, dst0_l, dst, 8); + dst += dst_stride; + ST_SH2(dst1_r, dst1_l, dst, 8); + dst += dst_stride; + } +} + +static void hevc_vt_4t_24w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10, src11; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src21_l, src43_l; + v8i16 dst0_l, dst1_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + LD_SB3(src + 16, src_stride, src6, src7, src8); + src += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + LD_SB2(src + 16, src_stride, src9, src10); + src += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + dst2_r = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); + + ST_SH2(dst0_r, dst0_l, dst, 8); + ST_SH(dst2_r, dst + 16); + dst += dst_stride; + ST_SH2(dst1_r, dst1_l, dst, 8); + ST_SH(dst3_r, dst + 16); + dst += dst_stride; + + LD_SB2(src, src_stride, src5, src2); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + LD_SB2(src + 16, src_stride, src11, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); + dst2_r = const_vec; + DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); + dst3_r = const_vec; + 
DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); + + ST_SH2(dst0_r, dst0_l, dst, 8); + ST_SH(dst2_r, dst + 16); + dst += dst_stride; + ST_SH2(dst1_r, dst1_l, dst, 8); + ST_SH(dst3_r, dst + 16); + dst += dst_stride; + } +} + +static void hevc_vt_4t_32w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src6, src7, src8, src9, src10, src11; + v16i8 src10_r, src32_r, src76_r, src98_r; + v16i8 src21_r, src43_r, src87_r, src109_r; + v8i16 dst0_r, dst1_r, dst2_r, dst3_r; + v16i8 src10_l, src32_l, src76_l, src98_l; + v16i8 src21_l, src43_l, src87_l, src109_l; + v8i16 dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filt0, filt1; + v8i16 filter_vec, const_vec; + + src -= src_stride; + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + filter_vec = LD_SH(filter); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + LD_SB3(src + 16, src_stride, src6, src7, src8); + src += (3 * src_stride); + XORI_B3_128_SB(src6, src7, src8); + ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); + ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); + + LD_SB2(src + 16, src_stride, src9, src10); + src += (2 * src_stride); + XORI_B2_128_SB(src9, src10); + ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); + ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); + + dst0_r = const_vec; + DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); + dst2_r = const_vec; + DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); + dst2_l = const_vec; + DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l); + dst3_r = const_vec; + DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); + dst3_l = const_vec; + DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l); + + ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); + dst += dst_stride; + ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); + dst += dst_stride; + + LD_SB2(src, src_stride, src5, src2); + XORI_B2_128_SB(src5, src2); + ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); + ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); + + LD_SB2(src + 16, src_stride, src11, src8); + src += (2 * src_stride); + XORI_B2_128_SB(src11, src8); + ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); + ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l); + + dst0_r = const_vec; + DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); + dst0_l = const_vec; + DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); + dst1_r = const_vec; + DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); + dst1_l = const_vec; + DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); + dst2_r = const_vec; + DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); + dst2_l = const_vec; + 
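For the 16-byte-wide columns, ILVR_B/ILVL_B interleave the low and high byte halves of two neighbouring rows so that each 128-bit vector holds (row y, row y+1) byte pairs; DPADD_SB then multiplies every pair by the matching (tap, tap) pair splatted into filt0/filt1 and accumulates into one halfword lane per column, so the _r results cover output columns 0..7 and the _l results columns 8..15. A rough scalar view of one such pair-wise accumulate step, names illustrative:

    #include <stdint.h>

    /* One ILVR/ILVL + DPADD step in scalar form: two neighbouring rows are
     * combined with two adjacent filter taps and accumulated per column. */
    static void dpadd_pair_ref(const uint8_t *row0, const uint8_t *row1,
                               int8_t tap0, int8_t tap1,
                               int16_t *acc, int n)
    {
        for (int x = 0; x < n; x++)        /* n = 8 per _r or _l half */
            acc[x] += row0[x] * tap0 + row1[x] * tap1;
    }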
DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l); + dst3_r = const_vec; + DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); + dst3_l = const_vec; + DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l); + + ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); + dst += dst_stride; + ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); + dst += dst_stride; + } +} + +static void hevc_hv_4t_4x2_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst1_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src -= (src_stride + 1); + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB2(src, src_stride, src3, src4); + XORI_B2_128_SB(src3, src4); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); + ST8x2_UB(dst0_r, dst, 2 * dst_stride); +} + +static void hevc_hv_4t_4x4_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 
* src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + LD_SB4(src, src_stride, src3, src4, src5, src6); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + dst10_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_r >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + dst21_r = __msa_ilvr_h(dst2, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_r >>= 6; + + PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); + ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride); +} + + +static void hevc_hv_4t_4multx8mult_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 src7, src8, src9, src10; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9; + v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; + v8i16 dst10_r, dst32_r, dst54_r, dst76_r; + v8i16 dst21_r, dst43_r, dst65_r, dst87_r; + + src -= (src_stride + 1); + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); + + for (loop_cnt = height >> 3; loop_cnt--;) { + LD_SB8(src, src_stride, + src3, 
src4, src5, src6, src7, src8, src9, src10); + src += (8 * src_stride); + XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + dst32_r = __msa_ilvr_h(dst3, dst2); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_r >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + dst43_r = __msa_ilvr_h(dst4, dst3); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_r >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + dst54_r = __msa_ilvr_h(dst5, dst4); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_r >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + dst65_r = __msa_ilvr_h(dst6, dst5); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_r >>= 6; + + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + dst76_r = __msa_ilvr_h(dst7, dst6); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_r >>= 6; + + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + dst87_r = __msa_ilvr_h(dst8, dst7); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_r >>= 6; + + VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); + dst9 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9); + + dst10_r = __msa_ilvr_h(dst9, dst8); + dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1); + dst6_r >>= 6; + + VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + dst21_r = __msa_ilvr_h(dst2, dst9); + dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1); + dst7_r >>= 6; + + PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r, + dst5_r, dst4_r, dst7_r, dst6_r, + dst0_r, dst1_r, dst2_r, dst3_r); + ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride); + dst += (8 * dst_stride); + } +} + +static void hevc_hv_4t_4w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + if (2 == height) { + hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y); + } else if (4 == height) { + hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y); + } else if (0 == (height % 8)) { + hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } +} + +static void hevc_hv_4t_6w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + uint32_t loop_cnt; + uint64_t dst_val0, dst_val1, dst_val2, dst_val3; + uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, 
dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src -= (src_stride + 1); + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst0_r, dst1_r, dst2_r, dst3_r); + + dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0); + dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0); + dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0); + dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0); + + dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2); + dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2); + dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2); + dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2); + + SD(dst_val0, dst); + SW(dst_val_int0, dst + 4); + dst += dst_stride; + SD(dst_val1, dst); + SW(dst_val_int1, dst + 4); + dst += dst_stride; + SD(dst_val2, dst); + SW(dst_val_int2, dst + 4); + dst += dst_stride; + SD(dst_val3, dst); + SW(dst_val_int3, dst + 4); + dst += dst_stride; + + } +} + +static 
void hevc_hv_4t_8x2_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + XORI_B2_128_SB(src3, src4); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + dst0_r >>= 6; + dst0_l >>= 6; + + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); + ST_SW2(dst0_r, dst1_r, dst, dst_stride); +} + +static void hevc_hv_4t_8x6_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v4i32 dst4_r, dst4_l, dst5_r, dst5_l; + v8i16 dst10_r, dst32_r, dst10_l, dst32_l; + v8i16 dst21_r, dst43_r, dst21_l, dst43_l; + v8i16 dst54_r, dst54_l, dst65_r, dst65_l; + v8i16 dst76_r, dst76_l, dst87_r, dst87_l; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + 
SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + LD_SB2(src, src_stride, src3, src4); + src += (2 * src_stride); + + XORI_B2_128_SB(src3, src4); + + /* row 3 */ + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst6 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); + + ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); + dst3_r >>= 6; + dst3_l >>= 6; + + LD_SB2(src, src_stride, src7, src8); + + XORI_B2_128_SB(src7, src8); + + /* row 7 */ + VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); + dst7 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); + + ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); + dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); + dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); + dst4_r >>= 6; + dst4_l >>= 6; + + /* row 8 */ + VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); + dst8 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8); + + ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); + dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); + dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); + dst5_r >>= 6; + dst5_l >>= 6; + + PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); + PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r); + + ST_SW2(dst0_r, dst1_r, dst, dst_stride); + dst += (2 * dst_stride); + ST_SW2(dst2_r, dst3_r, dst, dst_stride); + dst += (2 * dst_stride); + ST_SW2(dst4_r, dst5_r, dst, dst_stride); +} + +static 
void hevc_hv_4t_8multx4mult_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height, + int32_t width) +{ + uint32_t loop_cnt, cnt; + uint8_t *src_tmp; + int16_t *dst_tmp; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v8i16 filt0, filt1; + v4i32 filt_h0, filt_h1; + v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask1; + v8i16 filter_vec, const_vec; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; + v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; + v8i16 dst10_r, dst32_r, dst21_r, dst43_r; + v8i16 dst10_l, dst32_l, dst21_l, dst43_l; + + src -= (src_stride + 1); + + filter_vec = LD_SH(filter_x); + SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); + + filter_vec = LD_SH(filter_y); + vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); + filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); + + SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); + + mask1 = mask0 + 2; + + const_vec = __msa_ldi_h(128); + const_vec <<= 6; + + for (cnt = width >> 3; cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB3(src_tmp, src_stride, src0, src1, src2); + src_tmp += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + + VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); + VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); + VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); + + dst0 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); + dst1 = const_vec; + DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); + dst2 = const_vec; + DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); + ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); + + for (loop_cnt = height >> 2; loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); + src_tmp += (4 * src_stride); + XORI_B4_128_SB(src3, src4, src5, src6); + + VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); + dst3 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); + + ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); + dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); + dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); + + dst0_r >>= 6; + dst0_l >>= 6; + + /* row 4 */ + VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); + dst4 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); + + ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); + dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); + dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); + dst1_r >>= 6; + dst1_l >>= 6; + + /* row 5 */ + VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); + dst5 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); + + ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l); + dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1); + dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1); + + dst2_r >>= 6; + dst2_l >>= 6; + + /* row 6 */ + VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); + dst2 = const_vec; + DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); + + ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l); + dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1); + dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1); + + dst3_r >>= 6; + dst3_l >>= 6; + + PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, + dst2_l, dst2_r, dst3_l, dst3_r, + dst0_r, dst1_r, dst2_r, dst3_r); + + ST_SW2(dst0_r, dst1_r, 
dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride); + dst_tmp += (2 * dst_stride); + } + + src += 8; + dst += 8; + } +} + +static void hevc_hv_4t_8w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + + if (2 == height) { + hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (6 == height) { + hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height); + } else if (0 == (height % 4)) { + hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + } +} + +static void hevc_hv_4t_12w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 8); + + hevc_hv_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, + filter_x, filter_y, height); + +} + +static void hevc_hv_4t_16w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 16); +} + +static void hevc_hv_4t_24w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 24); +} + +static void hevc_hv_4t_32w_msa(uint8_t *src, + int32_t src_stride, + int16_t *dst, + int32_t dst_stride, + const int8_t *filter_x, + const int8_t *filter_y, + int32_t height) +{ + hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, + filter_x, filter_y, height, 32); +} + +#define MC_COPY(WIDTH) \ +void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \ +} + +MC_COPY(4); +MC_COPY(6); +MC_COPY(8); +MC_COPY(12); +MC_COPY(16); +MC_COPY(24); +MC_COPY(32); +MC_COPY(48); +MC_COPY(64); + +#undef MC_COPY + +#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ +void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \ + uint8_t *src, \ + ptrdiff_t src_stride, \ + int height, \ + intptr_t mx, \ + intptr_t my, \ + int width) \ +{ \ + const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ + \ + hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ + MAX_PB_SIZE, filter, height); \ +} + +MC(qpel, h, 4, 8, hz, mx); +MC(qpel, h, 8, 8, hz, mx); +MC(qpel, h, 12, 8, hz, mx); +MC(qpel, h, 16, 8, hz, mx); +MC(qpel, h, 24, 8, hz, mx); +MC(qpel, h, 32, 8, hz, mx); +MC(qpel, h, 48, 8, hz, mx); +MC(qpel, h, 64, 8, hz, mx); + +MC(qpel, v, 4, 8, vt, my); +MC(qpel, v, 8, 8, vt, my); +MC(qpel, v, 12, 8, vt, my); +MC(qpel, v, 16, 8, vt, my); +MC(qpel, v, 24, 8, vt, my); +MC(qpel, v, 32, 8, vt, my); +MC(qpel, v, 48, 8, vt, my); +MC(qpel, v, 64, 8, vt, my); + +MC(epel, h, 4, 4, hz, mx); +MC(epel, h, 6, 4, hz, mx); +MC(epel, h, 8, 4, hz, mx); +MC(epel, h, 12, 4, hz, mx); +MC(epel, h, 16, 4, hz, mx); +MC(epel, h, 24, 4, hz, mx); +MC(epel, h, 32, 4, hz, mx); + +MC(epel, v, 4, 4, vt, my); +MC(epel, v, 6, 4, vt, my); +MC(epel, v, 8, 4, vt, my); +MC(epel, v, 12, 4, vt, my); 
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                    \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,             \
+                                                   uint8_t *src,             \
+                                                   ptrdiff_t src_stride,     \
+                                                   int height,               \
+                                                   intptr_t mx,              \
+                                                   intptr_t my,              \
+                                                   int width)                \
+{                                                                            \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
+                                                                             \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \
+                                          filter_x, filter_y, height);       \
+}
+
+MC_HV(qpel, hv, 4, 8, hv);
+MC_HV(qpel, hv, 8, 8, hv);
+MC_HV(qpel, hv, 12, 8, hv);
+MC_HV(qpel, hv, 16, 8, hv);
+MC_HV(qpel, hv, 24, 8, hv);
+MC_HV(qpel, hv, 32, 8, hv);
+MC_HV(qpel, hv, 48, 8, hv);
+MC_HV(qpel, hv, 64, 8, hv);
+
+MC_HV(epel, hv, 4, 4, hv);
+MC_HV(epel, hv, 6, 4, hv);
+MC_HV(epel, hv, 8, 4, hv);
+MC_HV(epel, hv, 12, 4, hv);
+MC_HV(epel, hv, 16, 4, hv);
+MC_HV(epel, hv, 24, 4, hv);
+MC_HV(epel, hv, 32, 4, hv);
+
+#undef MC_HV
diff --git a/libavcodec/mips/hevcpred_init_mips.c b/libavcodec/mips/hevcpred_init_mips.c
new file mode 100644
index 0000000000..331cfac115
--- /dev/null
+++ b/libavcodec/mips/hevcpred_init_mips.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevc.h"
+#include "libavcodec/mips/hevcpred_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_pred_init_msa(HEVCPredContext *c, const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->intra_pred[2]   = ff_intra_pred_8_16x16_msa;
+        c->intra_pred[3]   = ff_intra_pred_8_32x32_msa;
+        c->pred_planar[0]  = ff_hevc_intra_pred_planar_0_msa;
+        c->pred_planar[1]  = ff_hevc_intra_pred_planar_1_msa;
+        c->pred_planar[2]  = ff_hevc_intra_pred_planar_2_msa;
+        c->pred_planar[3]  = ff_hevc_intra_pred_planar_3_msa;
+        c->pred_dc         = ff_hevc_intra_pred_dc_msa;
+        c->pred_angular[0] = ff_pred_intra_pred_angular_0_msa;
+        c->pred_angular[1] = ff_pred_intra_pred_angular_1_msa;
+        c->pred_angular[2] = ff_pred_intra_pred_angular_2_msa;
+        c->pred_angular[3] = ff_pred_intra_pred_angular_3_msa;
+    }
+}
+#endif // #if HAVE_MSA
+
+void ff_hevc_pred_init_mips(HEVCPredContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+    hevc_pred_init_msa(c, bit_depth);
+#endif // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcpred_mips.h b/libavcodec/mips/hevcpred_mips.h
new file mode 100644
index 0000000000..12f57a2a3c
--- /dev/null
+++ b/libavcodec/mips/hevcpred_mips.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
+#define AVCODEC_MIPS_HEVCPRED_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx);
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx);
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx);
+
+#endif // #ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
new file mode 100644
index 0000000000..6a3b2815fd
--- /dev/null
+++ b/libavcodec/mips/hevcpred_msa.c
@@ -0,0 +1,3084 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/hevc.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "hevcpred_mips.h" + +static const int8_t intra_pred_angle_up[17] = { + -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +}; + +static const int8_t intra_pred_angle_low[16] = { + 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26 +}; + +#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, \ + mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \ + res0, res1, mul_val_b0, mul_val_b1, round) \ +{ \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1, \ + mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m); \ + \ + res0_m += mul_val_h1 * tmp0; \ + res1_m += mul_val_h3 * tmp0; \ + res2_m += mul_val_h1 * tmp0; \ + res3_m += mul_val_h3 * tmp0; \ + \ + res0_m += mul_val_b0 * src0_r; \ + res1_m += mul_val_b0 * src0_l; \ + res2_m += (mul_val_b0 - 1) * src0_r; \ + res3_m += (mul_val_b0 - 1) * src0_l; \ + \ + res0_m += mul_val_b1 * tmp1; \ + res1_m += mul_val_b1 * tmp1; \ + res2_m += (mul_val_b1 + 1) * tmp1; \ + res3_m += (mul_val_b1 + 1) * tmp1; \ + \ + SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round); \ + PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1); \ +} + +static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint32_t col; + uint32_t src_data; + v8i16 vec0, vec1, vec2; + v16i8 zero = { 0 }; + + src_data = LW(src_top); + SW4(src_data, src_data, src_data, src_data, dst, stride); + + if (0 == flag) { + src_data = LW(src_left); + + vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data); + + vec0 = __msa_fill_h(src_left[-1]); + vec1 = __msa_fill_h(src_top[0]); + + vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2); + vec2 -= vec0; + vec2 >>= 1; + vec2 += vec1; + vec2 = CLIP_SH_0_255(vec2); + + for (col = 0; col < 4; col++) { + dst[stride * col] = (uint8_t) vec2[col]; + } + } +} + +static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint8_t *tmp_dst = dst; + uint32_t row; + uint16_t val0, val1, val2, val3; + uint64_t src_data1; + v8i16 vec0, vec1, vec2; + v16i8 zero = { 0 }; + + src_data1 = LD(src_top); + + for (row = 8; row--;) { + SD(src_data1, tmp_dst); + tmp_dst += stride; + } + + if (0 == flag) { + src_data1 = LD(src_left); + + vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1); + + vec0 = __msa_fill_h(src_left[-1]); + vec1 = __msa_fill_h(src_top[0]); + + vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2); + vec2 -= vec0; + vec2 >>= 1; + vec2 += vec1; + vec2 = CLIP_SH_0_255(vec2); + + val0 = vec2[0]; + val1 = vec2[1]; + val2 = vec2[2]; + val3 = vec2[3]; + + dst[0] = val0; + dst[stride] = val1; + dst[2 * stride] = val2; + dst[3 * stride] = val3; + + val0 = vec2[4]; + val1 = vec2[5]; + val2 = vec2[6]; + val3 = vec2[7]; + + dst[4 * stride] = val0; + dst[5 * stride] = val1; + dst[6 * stride] = val2; + dst[7 * stride] = val3; + } +} + +static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + int32_t col; + uint8_t *tmp_dst = dst; + uint32_t row; + v16u8 src; + v8i16 vec0, vec1, vec2, vec3; + + src = LD_UB(src_top); 
+ + for (row = 16; row--;) { + ST_UB(src, tmp_dst); + tmp_dst += stride; + } + + if (0 == flag) { + src = LD_UB(src_left); + + vec0 = __msa_fill_h(src_left[-1]); + vec1 = __msa_fill_h(src_top[0]); + + UNPCK_UB_SH(src, vec2, vec3); + SUB2(vec2, vec0, vec3, vec0, vec2, vec3); + + vec2 >>= 1; + vec3 >>= 1; + + ADD2(vec2, vec1, vec3, vec1, vec2, vec3); + CLIP_SH2_0_255(vec2, vec3); + + src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2); + + for (col = 0; col < 16; col++) { + dst[stride * col] = src[col]; + } + } +} + +static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint32_t val0, val1, val2, val3; + v16i8 src0; + v8i16 src0_r, src_top_val, src_left_val; + v16i8 zero = { 0 }; + + val0 = src_left[0] * 0x01010101; + val1 = src_left[1] * 0x01010101; + val2 = src_left[2] * 0x01010101; + val3 = src_left[3] * 0x01010101; + SW4(val0, val1, val2, val3, dst, stride); + + if (0 == flag) { + val0 = LW(src_top); + src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0); + src_top_val = __msa_fill_h(src_top[-1]); + src_left_val = __msa_fill_h(src_left[0]); + + src0_r = (v8i16) __msa_ilvr_b(zero, src0); + + src0_r -= src_top_val; + src0_r >>= 1; + src0_r += src_left_val; + src0_r = CLIP_SH_0_255(src0_r); + src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); + val0 = __msa_copy_s_w((v4i32) src0, 0); + SW(val0, dst); + } +} + +static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint64_t val0, val1, val2, val3; + v16i8 src0; + v8i16 src0_r, src_top_val, src_left_val; + v16i8 zero = { 0 }; + + val0 = src_left[0] * 0x0101010101010101; + val1 = src_left[1] * 0x0101010101010101; + val2 = src_left[2] * 0x0101010101010101; + val3 = src_left[3] * 0x0101010101010101; + SD4(val0, val1, val2, val3, dst, stride); + + val0 = src_left[4] * 0x0101010101010101; + val1 = src_left[5] * 0x0101010101010101; + val2 = src_left[6] * 0x0101010101010101; + val3 = src_left[7] * 0x0101010101010101; + SD4(val0, val1, val2, val3, dst + 4 * stride, stride); + + if (0 == flag) { + val0 = LD(src_top); + src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0); + src_top_val = __msa_fill_h(src_top[-1]); + src_left_val = __msa_fill_h(src_left[0]); + + src0_r = (v8i16) __msa_ilvr_b(zero, src0); + + src0_r -= src_top_val; + src0_r >>= 1; + src0_r += src_left_val; + src0_r = CLIP_SH_0_255(src0_r); + src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); + val0 = __msa_copy_s_d((v2i64) src0, 0); + SD(val0, dst); + } +} + +static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint8_t *tmp_dst = dst; + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16i8 src0, src1, src2, src3; + v8i16 src0_r, src0_l, src_left_val, src_top_val; + + src_left_val = __msa_fill_h(src_left[0]); + + for (row = 4; row--;) { + inp0 = src_left[0]; + inp1 = src_left[1]; + inp2 = src_left[2]; + inp3 = src_left[3]; + src_left += 4; + + src0 = __msa_fill_b(inp0); + src1 = __msa_fill_b(inp1); + src2 = __msa_fill_b(inp2); + src3 = __msa_fill_b(inp3); + + ST_SB4(src0, src1, src2, src3, tmp_dst, stride); + tmp_dst += (4 * stride); + } + + if (0 == flag) { + src0 = LD_SB(src_top); + src_top_val = __msa_fill_h(src_top[-1]); + + UNPCK_UB_SH(src0, src0_r, src0_l); + SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l); + + src0_r >>= 1; + src0_l >>= 1; + + ADD2(src0_r, src_left_val, src0_l, 
src_left_val, src0_r, src0_l); + CLIP_SH2_0_255(src0_r, src0_l); + src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r); + ST_SB(src0, dst); + } +} + +static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16i8 src0, src1, src2, src3; + + for (row = 0; row < 8; row++) { + inp0 = src_left[row * 4]; + inp1 = src_left[row * 4 + 1]; + inp2 = src_left[row * 4 + 2]; + inp3 = src_left[row * 4 + 3]; + + src0 = __msa_fill_b(inp0); + src1 = __msa_fill_b(inp1); + src2 = __msa_fill_b(inp2); + src3 = __msa_fill_b(inp3); + + ST_SB2(src0, src0, dst, 16); + dst += stride; + ST_SB2(src1, src1, dst, 16); + dst += stride; + ST_SB2(src2, src2, dst, 16); + dst += stride; + ST_SB2(src3, src3, dst, 16); + dst += stride; + } +} + +static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint8_t *tmp_dst = dst; + uint32_t addition = 0; + uint32_t val0, val1, val2; + v16i8 src = { 0 }; + v16u8 store; + v16i8 zero = { 0 }; + v8u16 sum, vec0, vec1; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum = __msa_hadd_u_h((v16u8) src, (v16u8) src); + sum = (v8u16) __msa_hadd_u_w(sum, sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_srari_w((v4i32) sum, 3); + addition = __msa_copy_u_w((v4i32) sum, 0); + store = (v16u8) __msa_fill_b(addition); + val0 = __msa_copy_u_w((v4i32) store, 0); + SW4(val0, val0, val0, val0, dst, stride) + + if (0 == flag) { + ILVR_B2_UH(zero, store, zero, src, vec0, vec1); + + vec1 += vec0; + vec0 += vec0; + vec1 += vec0; + + vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2); + store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1); + val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2; + store = (v16u8) __msa_insert_b((v16i8) store, 0, val1); + val0 = __msa_copy_u_w((v4i32) store, 0); + SW(val0, tmp_dst); + + val0 = src_left[1]; + val1 = src_left[2]; + val2 = src_left[3]; + + addition *= 3; + + ADD2(val0, addition, val1, addition, val0, val1); + val2 += addition; + + val0 += 2; + val1 += 2; + val2 += 2; + val0 >>= 2; + val1 >>= 2; + val2 >>= 2; + + tmp_dst[stride * 1] = val0; + tmp_dst[stride * 2] = val1; + tmp_dst[stride * 3] = val2; + } +} + +static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint8_t *tmp_dst = dst; + uint32_t row, col, val; + uint32_t addition = 0; + uint64_t val0, val1; + v16u8 src = { 0 }; + v16u8 store; + v8u16 sum, vec0, vec1; + v16i8 zero = { 0 }; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum = __msa_hadd_u_h((v16u8) src, (v16u8) src); + sum = (v8u16) __msa_hadd_u_w(sum, sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_srari_w((v4i32) sum, 4); + addition = __msa_copy_u_w((v4i32) sum, 0); + store = (v16u8) __msa_fill_b(addition); + val0 = __msa_copy_u_d((v2i64) store, 0); + + for (row = 8; row--;) { + SD(val0, dst); + dst += stride; + } + + if (0 == flag) { + ILVR_B2_UH(zero, store, zero, src, vec0, vec1); + + vec1 += vec0; + vec0 += vec0; + vec1 += vec0; + vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2); + store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1); + val = (src_left[0] + 2 * addition + src_top[0] + 
2) >> 2; + store = (v16u8) __msa_insert_b((v16i8) store, 0, val); + val0 = __msa_copy_u_d((v2i64) store, 0); + SD(val0, tmp_dst); + + val0 = LD(src_left); + src = (v16u8) __msa_insert_d((v2i64) src, 0, val0); + vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src); + vec0 = (v8u16) __msa_fill_h(addition); + vec0 *= 3; + vec1 += vec0; + vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2); + + for (col = 1; col < 8; col++) { + tmp_dst[stride * col] = vec1[col]; + } + } +} + +static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + int32_t flag) +{ + uint8_t *tmp_dst = dst; + uint32_t row, col, val; + uint32_t addition = 0; + v16u8 src_above1, store, src_left1; + v8u16 sum, sum_above, sum_left; + v8u16 vec0, vec1, vec2; + v16i8 zero = { 0 }; + + src_above1 = LD_UB(src_top); + src_left1 = LD_UB(src_left); + + HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left); + sum = sum_above + sum_left; + sum = (v8u16) __msa_hadd_u_w(sum, sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_srari_w((v4i32) sum, 5); + addition = __msa_copy_u_w((v4i32) sum, 0); + store = (v16u8) __msa_fill_b(addition); + + for (row = 16; row--;) { + ST_UB(store, dst); + dst += stride; + } + + if (0 == flag) { + vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store); + ILVRL_B2_UH(zero, src_above1, vec1, vec2); + ADD2(vec1, vec0, vec2, vec0, vec1, vec2); + vec0 += vec0; + ADD2(vec1, vec0, vec2, vec0, vec1, vec2); + SRARI_H2_UH(vec1, vec2, 2); + store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1); + val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2; + store = (v16u8) __msa_insert_b((v16i8) store, 0, val); + ST_UB(store, tmp_dst); + + ILVRL_B2_UH(zero, src_left1, vec1, vec2); + vec0 = (v8u16) __msa_fill_h(addition); + vec0 *= 3; + ADD2(vec1, vec0, vec2, vec0, vec1, vec2); + SRARI_H2_UH(vec1, vec2, 2); + store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1); + + for (col = 1; col < 16; col++) { + tmp_dst[stride * col] = store[col]; + } + } +} + +static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + uint32_t row; + v16u8 src_above1, src_above2, store, src_left1, src_left2; + v8u16 sum_above1, sum_above2; + v8u16 sum_left1, sum_left2; + v8u16 sum, sum_above, sum_left; + + LD_UB2(src_top, 16, src_above1, src_above2); + LD_UB2(src_left, 16, src_left1, src_left2); + HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2); + HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2); + sum_above = sum_above1 + sum_above2; + sum_left = sum_left1 + sum_left2; + sum = sum_above + sum_left; + sum = (v8u16) __msa_hadd_u_w(sum, sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum); + sum = (v8u16) __msa_srari_w((v4i32) sum, 6); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + + for (row = 16; row--;) { + ST_UB2(store, store, dst, 16); + dst += stride; + ST_UB2(store, store, dst, 16); + dst += stride; + } +} + +static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + uint32_t src0, src1; + v16i8 src_vec0, src_vec1; + v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1; + v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v8i16 mul_val0 = { 3, 2, 1, 0, 1, 
2, 3, 4 }; + v16i8 zero = { 0 }; + + src0 = LW(src_top); + src1 = LW(src_left); + + mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0); + + src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0); + src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1); + + ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r); + SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3); + + tmp0 = __msa_fill_h(src_top[4]); + tmp1 = __msa_fill_h(src_left[4]); + + MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3, + res0, res1, res2, res3); + + res0 += mul_val1 * tmp0; + res1 += mul_val1 * tmp0; + res2 += mul_val1 * tmp0; + res3 += mul_val1 * tmp0; + + res0 += 3 * src_vec0_r; + res1 += 2 * src_vec0_r; + res2 += src_vec0_r; + res0 += tmp1; + res1 += 2 * tmp1; + res2 += 3 * tmp1; + res3 += 4 * tmp1; + + PCKEV_D2_SH(res1, res0, res3, res2, res0, res1); + SRARI_H2_SH(res0, res1, 3); + src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0); + ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride); +} + +static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + uint64_t src0, src1; + v16i8 src_vec0, src_vec1, src_vec2, src_vec3; + v8i16 src_vec0_r, src_vec1_r; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 tmp0, tmp1, tmp2; + v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 }; + v16i8 zero = { 0 }; + + src0 = LD(src_top); + src1 = LD(src_left); + + src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0); + src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1); + + ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r); + SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3); + SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7); + + tmp0 = __msa_fill_h(src_top[8]); + tmp1 = __msa_fill_h(src_left[8]); + + MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3, + res0, res1, res2, res3); + MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7, + res4, res5, res6, res7); + + tmp2 = mul_val1 * tmp0; + res0 += tmp2; + res1 += tmp2; + res2 += tmp2; + res3 += tmp2; + res4 += tmp2; + res5 += tmp2; + res6 += tmp2; + res7 += tmp2; + + res0 += 7 * src_vec0_r; + res1 += 6 * src_vec0_r; + res2 += 5 * src_vec0_r; + res3 += 4 * src_vec0_r; + res4 += 3 * src_vec0_r; + res5 += 2 * src_vec0_r; + res6 += src_vec0_r; + + res0 += tmp1; + res1 += 2 * tmp1; + res2 += 3 * tmp1; + res3 += 4 * tmp1; + res4 += 5 * tmp1; + res5 += 6 * tmp1; + res6 += 7 * tmp1; + res7 += 8 * tmp1; + + SRARI_H4_SH(res0, res1, res2, res3, 4); + SRARI_H4_SH(res4, res5, res6, res7, 4); + PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, + src_vec0, src_vec1, src_vec2, src_vec3); + + ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride); +} + +static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + v16u8 src0, src1; + v8i16 src0_r, src1_r, src0_l, src1_l; + v8i16 vec0, vec1; + v8i16 res0, res1, tmp0, tmp1; + v8i16 mul_val2, mul_val3; + v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 }; + + src0 = LD_UB(src_top); + src1 = LD_UB(src_left); + + UNPCK_UB_SH(src0, src0_r, src0_l); + UNPCK_UB_SH(src1, src1_r, src1_l); + + mul_val2 = mul_val0 - 8; + mul_val3 = mul_val1 + 8; + + tmp0 = __msa_fill_h(src_top[16]); + tmp1 = __msa_fill_h(src_left[16]); 
+ + SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 15, 1, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 13, 3, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 11, 5, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 9, 7, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 7, 9, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 5, 11, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 3, 13, 5); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 1, 15, 5); + ST_SH2(res0, res1, dst, stride); +} + +static void process_intra_upper_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + uint8_t offset) +{ + v16i8 src0, src1; + v8i16 src0_r, src1_r, src0_l, src1_l; + v8i16 vec0, vec1, res0, res1; + v8i16 tmp0, tmp1; + v8i16 mul_val2, mul_val3; + v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 }; + + tmp0 = __msa_fill_h(src_top[32 - offset]); + tmp1 = __msa_fill_h(src_left[32]); + + src0 = LD_SB(src_top); + src1 = LD_SB(src_left); + + UNPCK_UB_SH(src0, src0_r, src0_l); + UNPCK_UB_SH(src1, src1_r, src1_l); + + mul_val1 += offset; + mul_val0 -= offset; + mul_val2 = mul_val0 - 8; + mul_val3 = mul_val1 + 8; + + SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 31, 1, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 29, 3, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 27, 5, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 25, 7, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, 
src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 23, 9, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 21, 11, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 19, 13, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 17, 15, 6); + ST_SH2(res0, res1, dst, stride); +} + +static void process_intra_lower_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride, + uint8_t offset) +{ + v16i8 src0, src1; + v8i16 src0_r, src1_r, src0_l, src1_l; + v8i16 vec0, vec1, res0, res1, tmp0, tmp1; + v8i16 mul_val2, mul_val3; + v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 }; + + tmp0 = __msa_fill_h(src_top[32 - offset]); + tmp1 = __msa_fill_h(src_left[16]); + + src0 = LD_SB(src_top); + src1 = LD_SB(src_left); + + UNPCK_UB_SH(src0, src0_r, src0_l); + UNPCK_UB_SH(src1, src1_r, src1_l); + + mul_val1 += offset; + mul_val0 -= offset; + mul_val2 = mul_val0 - 8; + mul_val3 = mul_val1 + 8; + + SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 15, 17, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 13, 19, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 11, 21, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 9, 23, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 7, 25, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 5, 27, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 3, 29, 6); + ST_SH2(res0, res1, dst, stride); + dst += (2 * stride); + + SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1); + HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, + mul_val0, mul_val1, mul_val2, mul_val3, + res0, res1, 1, 31, 6); + ST_SH2(res0, res1, dst, stride); +} + +static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t stride) +{ + 
process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0); + process_intra_upper_16x16_msa((src_top + 16), src_left, + (dst + 16), stride, 16); + dst += (16 * stride); + src_left += 16; + + process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0); + process_intra_lower_16x16_msa((src_top + 16), src_left, + (dst + 16), stride, 16); +} + +static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 }; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 4; + const uint8_t *ref; + int32_t last; + int32_t h_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3; + int32_t angle, angle_loop; + int32_t inv_angle_val, offset; + uint64_t tmp0; + v16i8 top0, top1, top2, top3; + v16i8 dst_val0; + v16i8 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + + angle = intra_pred_angle_up[mode - 18]; + inv_angle_val = inv_angle[mode - 18]; + last = (angle) >> 3; + angle_loop = angle; + + ref = src_top - 1; + if (angle < 0 && last < -1) { + inv_angle_val = inv_angle[mode - 18]; + + tmp0 = LD(ref); + SD(tmp0, ref_tmp); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8); + ref_tmp[h_cnt] = src_left[offset]; + } + + ref = ref_tmp; + } + + idx0 = angle_loop >> 5; + fact_val0 = angle_loop & 31; + angle_loop += angle; + + idx1 = angle_loop >> 5; + fact_val1 = angle_loop & 31; + angle_loop += angle; + + idx2 = angle_loop >> 5; + fact_val2 = angle_loop & 31; + angle_loop += angle; + + idx3 = angle_loop >> 5; + fact_val3 = angle_loop & 31; + + top0 = LD_SB(ref + idx0 + 1); + top1 = LD_SB(ref + idx1 + 1); + top2 = LD_SB(ref + idx2 + 1); + top3 = LD_SB(ref + idx3 + 1); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2); + ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3); + ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3, + diff0, diff2, diff4, diff6); + SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2); + ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2); + ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3); + MUL2(diff1, fact0, diff3, fact2, diff1, diff3); + + diff1 += diff0 * fact1; + diff3 += diff2 * fact3; + + SRARI_H2_SH(diff1, diff3, 5); + dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1); + ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride); +} + +static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 }; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 8; + const uint8_t *ref; + const uint8_t *src_left_tmp = src_left - 1; + int32_t last, offset; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3; + int32_t angle, angle_loop; + int32_t inv_angle_val, inv_angle_val_loop; + int32_t tmp0, tmp1, tmp2; + v16i8 top0, top1, top2, top3; + v16u8 dst_val0, dst_val1; 
+ v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + + angle = intra_pred_angle_up[mode - 18]; + inv_angle_val = inv_angle[mode - 18]; + last = (angle) >> 2; + angle_loop = angle; + + ref = src_top - 1; + if (last < -1) { + inv_angle_val_loop = inv_angle_val * last; + + tmp0 = LW(ref); + tmp1 = LW(ref + 4); + tmp2 = LW(ref + 8); + SW(tmp0, ref_tmp); + SW(tmp1, ref_tmp + 4); + SW(tmp2, ref_tmp + 8); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (inv_angle_val_loop + 128) >> 8; + ref_tmp[h_cnt] = src_left_tmp[offset]; + inv_angle_val_loop += inv_angle_val; + } + ref = ref_tmp; + } + + for (v_cnt = 0; v_cnt < 2; v_cnt++) { + idx0 = (angle_loop) >> 5; + fact_val0 = (angle_loop) & 31; + angle_loop += angle; + + idx1 = (angle_loop) >> 5; + fact_val1 = (angle_loop) & 31; + angle_loop += angle; + + idx2 = (angle_loop) >> 5; + fact_val2 = (angle_loop) & 31; + angle_loop += angle; + + idx3 = (angle_loop) >> 5; + fact_val3 = (angle_loop) & 31; + angle_loop += angle; + + top0 = LD_SB(ref + idx0 + 1); + top1 = LD_SB(ref + idx1 + 1); + top2 = LD_SB(ref + idx2 + 1); + top3 = LD_SB(ref + idx3 + 1); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + + SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2); + SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2); + MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6, + diff1, diff3, diff5, diff7); + + diff1 += diff0 * fact1; + diff3 += diff2 * fact3; + diff5 += diff4 * fact5; + diff7 += diff6 * fact7; + + SRARI_H4_SH(diff1, diff3, diff5, diff7, 5); + PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1); + ST8x4_UB(dst_val0, dst_val1, dst, stride); + dst += (4 * stride); + } +} + +static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 }; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3; + int32_t tmp0; + int32_t angle, angle_loop, offset; + int32_t inv_angle_val, inv_angle_val_loop; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 16; + const uint8_t *ref; + const uint8_t *src_left_tmp = src_left - 1; + int32_t last; + v16u8 top0, top1, top2, top3, top4, top5, top6, top7; + v16i8 dst0, dst1, dst2, dst3; + v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15; + + angle = intra_pred_angle_up[mode - 18]; + inv_angle_val = inv_angle[mode - 18]; + last = angle >> 1; + angle_loop = angle; + + ref = src_top - 1; + if (last < -1) { + inv_angle_val_loop = inv_angle_val * last; + + top0 = LD_UB(ref); + tmp0 = LW(ref + 16); + ST_UB(top0, ref_tmp); + SW(tmp0, ref_tmp + 16); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (inv_angle_val_loop + 128) >> 8; + ref_tmp[h_cnt] = src_left_tmp[offset]; + inv_angle_val_loop += inv_angle_val; + } + ref = ref_tmp; + } + + for (v_cnt = 4; v_cnt--;) 
{ + idx0 = (angle_loop) >> 5; + fact_val0 = (angle_loop) & 31; + angle_loop += angle; + + idx1 = (angle_loop) >> 5; + fact_val1 = (angle_loop) & 31; + angle_loop += angle; + + idx2 = (angle_loop) >> 5; + fact_val2 = (angle_loop) & 31; + angle_loop += angle; + + idx3 = (angle_loop) >> 5; + fact_val3 = (angle_loop) & 31; + angle_loop += angle; + + LD_UB2(ref + idx0 + 1, 16, top0, top1); + LD_UB2(ref + idx1 + 1, 16, top2, top3); + LD_UB2(ref + idx2 + 1, 16, top4, top5); + LD_UB2(ref + idx3 + 1, 16, top6, top7); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1); + SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1); + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + UNPCK_UB_SH(top4, diff8, diff9); + UNPCK_UB_SH(top5, diff10, diff11); + UNPCK_UB_SH(top6, diff12, diff13); + UNPCK_UB_SH(top7, diff14, diff15); + + MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2, + diff2, diff3, diff6, diff7); + MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6, + diff10, diff11, diff14, diff15); + + diff2 += diff0 * fact1; + diff3 += diff1 * fact1; + diff6 += diff4 * fact3; + diff7 += diff5 * fact3; + diff10 += diff8 * fact5; + diff11 += diff9 * fact5; + diff14 += diff12 * fact7; + diff15 += diff13 * fact7; + + SRARI_H4_SH(diff2, diff3, diff6, diff7, 5); + SRARI_H4_SH(diff10, diff11, diff14, diff15, 5); + PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14, + dst0, dst1, dst2, dst3); + ST_SB4(dst0, dst1, dst2, dst3, dst, stride); + dst += (4 * stride); + } +} + +static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 }; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp; + const uint8_t *ref; + const uint8_t *src_left_tmp = src_left - 1; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t tmp0, tmp1, tmp2, tmp3; + int32_t angle, angle_loop; + int32_t inv_angle_val, inv_angle_val_loop; + int32_t last, offset; + v16u8 top0, top1, top2, top3, top4, top5, top6, top7; + v16i8 dst0, dst1, dst2, dst3; + v8i16 fact0, fact1, fact2, fact3; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15; + + ref_tmp = ref_array + 32; + + angle = intra_pred_angle_up[mode - 18]; + inv_angle_val = inv_angle[mode - 18]; + last = angle; + angle_loop = angle; + + ref = src_top - 1; + if (last < -1) { + inv_angle_val_loop = inv_angle_val * last; + LD_UB2(ref, 16, top0, top1); + tmp0 = ref[32]; + tmp1 = ref[33]; + tmp2 = ref[34]; + tmp3 = ref[35]; + + ST_UB2(top0, top1, ref_tmp, 16); + ref_tmp[32] = tmp0; + ref_tmp[33] = tmp1; + ref_tmp[34] = tmp2; + ref_tmp[35] = tmp3; + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (inv_angle_val_loop + 128) >> 8; + ref_tmp[h_cnt] = src_left_tmp[offset]; + inv_angle_val_loop += inv_angle_val; + } + + ref = ref_tmp; + } + + for (v_cnt = 16; v_cnt--;) { + idx0 = (angle_loop) >> 5; + fact_val0 = (angle_loop) & 31; + angle_loop += angle; + + idx1 = (angle_loop) >> 5; + fact_val1 = (angle_loop) & 31; + 
angle_loop += angle; + + top0 = LD_UB(ref + idx0 + 1); + top4 = LD_UB(ref + idx1 + 1); + top1 = LD_UB(ref + idx0 + 17); + top5 = LD_UB(ref + idx1 + 17); + top3 = LD_UB(ref + idx0 + 33); + top7 = LD_UB(ref + idx1 + 33); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + + top2 = top1; + top6 = top5; + + SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1); + SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1); + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + UNPCK_UB_SH(top4, diff8, diff9); + UNPCK_UB_SH(top5, diff10, diff11); + UNPCK_UB_SH(top6, diff12, diff13); + UNPCK_UB_SH(top7, diff14, diff15); + + MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0, + diff2, diff3, diff6, diff7); + MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2, + diff10, diff11, diff14, diff15); + + diff2 += diff0 * fact1; + diff3 += diff1 * fact1; + diff6 += diff4 * fact1; + diff7 += diff5 * fact1; + diff10 += diff8 * fact3; + diff11 += diff9 * fact3; + diff14 += diff12 * fact3; + diff15 += diff13 * fact3; + + SRARI_H4_SH(diff2, diff3, diff6, diff7, 5); + SRARI_H4_SH(diff10, diff11, diff14, diff15, 5); + PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14, + dst0, dst1, dst2, dst3); + + ST_SB2(dst0, dst1, dst, 16); + dst += stride; + ST_SB2(dst2, dst3, dst, 16); + dst += stride; + } +} + +static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 }; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 4; + const uint8_t *ref; + int32_t last, offset; + int32_t h_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3; + int32_t angle, angle_loop, inv_angle_val; + uint64_t tmp0; + v16i8 dst_val0, dst_val1; + v16u8 top0, top1, top2, top3; + v16u8 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + + angle = intra_pred_angle_low[mode - 2]; + last = angle >> 3; + angle_loop = angle; + + ref = src_left - 1; + if (last < -1) { + inv_angle_val = inv_angle[mode - 11]; + + tmp0 = LD(ref); + SD(tmp0, ref_tmp); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8); + ref_tmp[h_cnt] = src_top[offset]; + } + + ref = ref_tmp; + } + + idx0 = angle_loop >> 5; + fact_val0 = angle_loop & 31; + angle_loop += angle; + + idx1 = angle_loop >> 5; + fact_val1 = angle_loop & 31; + angle_loop += angle; + + idx2 = angle_loop >> 5; + fact_val2 = angle_loop & 31; + angle_loop += angle; + + idx3 = angle_loop >> 5; + fact_val3 = angle_loop & 31; + + top0 = LD_UB(ref + idx0 + 1); + top1 = LD_UB(ref + idx1 + 1); + top2 = LD_UB(ref + idx2 + 1); + top3 = LD_UB(ref + idx3 + 1); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2); + ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3); + ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3, + diff0, diff2, diff4, diff6); + 
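+    /* Weighted two-tap filter: each output sample is
+     * ((32 - fact) * a + fact * b + 16) >> 5, where b is the reference
+     * sample following a (produced by the SLDI shift below) and the
+     * SRARI supplies the +16 rounding before the shift. */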
SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2); + ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2); + ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3); + MUL2(diff1, fact0, diff3, fact2, diff1, diff3); + + diff1 += diff0 * fact1; + diff3 += diff2 * fact3; + + SRARI_H2_SH(diff1, diff3, 5); + PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1); + + diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0); + diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0); + + diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0); + + dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2); + dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2); + + ST4x2_UB(dst_val0, dst, stride); + dst += (2 * stride); + ST4x2_UB(dst_val1, dst, stride); +} + +static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 }; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 8; + const uint8_t *ref; + const uint8_t *src_top_tmp = src_top - 1; + uint8_t *dst_org; + int32_t last, offset, tmp0, tmp1, tmp2; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3; + int32_t angle, angle_loop, inv_angle_val; + v16i8 top0, top1, top2, top3; + v16i8 dst_val0, dst_val1, dst_val2, dst_val3; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + + angle = intra_pred_angle_low[mode - 2]; + last = (angle) >> 2; + angle_loop = angle; + + ref = src_left - 1; + if (last < -1) { + inv_angle_val = inv_angle[mode - 11]; + + tmp0 = LW(ref); + tmp1 = LW(ref + 4); + tmp2 = LW(ref + 8); + SW(tmp0, ref_tmp); + SW(tmp1, ref_tmp + 4); + SW(tmp2, ref_tmp + 8); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (h_cnt * inv_angle_val + 128) >> 8; + ref_tmp[h_cnt] = src_top_tmp[offset]; + } + + ref = ref_tmp; + } + + for (v_cnt = 0; v_cnt < 2; v_cnt++) { + dst_org = dst; + + idx0 = angle_loop >> 5; + fact_val0 = angle_loop & 31; + angle_loop += angle; + + idx1 = angle_loop >> 5; + fact_val1 = angle_loop & 31; + angle_loop += angle; + + idx2 = angle_loop >> 5; + fact_val2 = angle_loop & 31; + angle_loop += angle; + + idx3 = angle_loop >> 5; + fact_val3 = angle_loop & 31; + angle_loop += angle; + + top0 = LD_SB(ref + idx0 + 1); + top1 = LD_SB(ref + idx1 + 1); + top2 = LD_SB(ref + idx2 + 1); + top3 = LD_SB(ref + idx3 + 1); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2); + SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2); + MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6, + diff1, diff3, diff5, diff7); + + diff1 += diff0 * fact1; + diff3 += diff2 * fact3; + diff5 += diff4 * fact5; + diff7 += diff6 * fact7; + + SRARI_H4_SH(diff1, diff3, diff5, diff7, 5); + PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7, + dst_val0, dst_val1, dst_val2, dst_val3); + ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1); + ILVRL_H2_SH(diff1, 
diff0, diff3, diff4); + ST4x8_UB(diff3, diff4, dst_org, stride); + dst += 4; + } +} + +static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 }; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1; + int32_t idx2, fact_val2, idx3, fact_val3, tmp0; + v16i8 top0, top1, dst_val0, top2, top3, dst_val1; + v16i8 top4, top5, dst_val2, top6, top7, dst_val3; + v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15; + int32_t angle, angle_loop, inv_angle_val, offset; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 16; + const uint8_t *ref, *src_top_tmp = src_top - 1; + uint8_t *dst_org; + int32_t last; + + angle = intra_pred_angle_low[mode - 2]; + last = (angle) >> 1; + angle_loop = angle; + + ref = src_left - 1; + if (last < -1) { + inv_angle_val = inv_angle[mode - 11]; + + top0 = LD_SB(ref); + tmp0 = LW(ref + 16); + ST_SB(top0, ref_tmp); + SW(tmp0, ref_tmp + 16); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (h_cnt * inv_angle_val + 128) >> 8; + ref_tmp[h_cnt] = src_top_tmp[offset]; + } + + ref = ref_tmp; + } + + for (v_cnt = 0; v_cnt < 4; v_cnt++) { + dst_org = dst; + + idx0 = angle_loop >> 5; + fact_val0 = angle_loop & 31; + angle_loop += angle; + + idx1 = angle_loop >> 5; + fact_val1 = angle_loop & 31; + angle_loop += angle; + + idx2 = angle_loop >> 5; + fact_val2 = angle_loop & 31; + angle_loop += angle; + + idx3 = angle_loop >> 5; + fact_val3 = angle_loop & 31; + angle_loop += angle; + + LD_SB2(ref + idx0 + 1, 16, top0, top1); + LD_SB2(ref + idx1 + 1, 16, top2, top3); + LD_SB2(ref + idx2 + 1, 16, top4, top5); + LD_SB2(ref + idx3 + 1, 16, top6, top7); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + fact4 = __msa_fill_h(fact_val2); + fact5 = __msa_fill_h(32 - fact_val2); + fact6 = __msa_fill_h(fact_val3); + fact7 = __msa_fill_h(32 - fact_val3); + + SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1); + SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1); + + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + UNPCK_UB_SH(top4, diff8, diff9); + UNPCK_UB_SH(top5, diff10, diff11); + UNPCK_UB_SH(top6, diff12, diff13); + UNPCK_UB_SH(top7, diff14, diff15); + + MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2, + diff2, diff3, diff6, diff7); + MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6, + diff10, diff11, diff14, diff15); + + diff2 += diff0 * fact1; + diff3 += diff1 * fact1; + diff6 += diff4 * fact3; + diff7 += diff5 * fact3; + diff10 += diff8 * fact5; + diff11 += diff9 * fact5; + diff14 += diff12 * fact7; + diff15 += diff13 * fact7; + + SRARI_H4_SH(diff2, diff3, diff6, diff7, 5); + SRARI_H4_SH(diff10, diff11, diff14, diff15, 5); + PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14, + dst_val0, dst_val1, dst_val2, dst_val3); + ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1); + ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3); + ILVRL_H2_SH(diff1, diff0, diff4, diff5); + ILVRL_H2_SH(diff3, diff2, diff6, diff7); + ST4x8_UB(diff4, diff5, dst_org, stride); + dst_org += (8 * stride); + 
ST4x8_UB(diff6, diff7, dst_org, stride); + dst += 4; + } +} + +static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t stride, + int32_t mode) +{ + int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 }; + int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0; + v16i8 top0, top1, dst_val0, top2, top3, dst_val1; + v16i8 top4, top5, dst_val2, top6, top7, dst_val3; + v8i16 fact0, fact1, fact2, fact3; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15; + int32_t angle, angle_loop, inv_angle_val, offset; + uint8_t ref_array[3 * 32 + 4]; + uint8_t *ref_tmp = ref_array + 32; + const uint8_t *ref, *src_top_tmp = src_top - 1; + uint8_t *dst_org; + int32_t last; + + angle = intra_pred_angle_low[mode - 2]; + last = angle; + angle_loop = angle; + + ref = src_left - 1; + if (last < -1) { + inv_angle_val = inv_angle[mode - 11]; + + LD_SB2(ref, 16, top0, top1); + tmp0 = LW(ref + 32); + ST_SB2(top0, top1, ref_tmp, 16); + SW(tmp0, ref_tmp + 32); + + for (h_cnt = last; h_cnt <= -1; h_cnt++) { + offset = (h_cnt * inv_angle_val + 128) >> 8; + ref_tmp[h_cnt] = src_top_tmp[offset]; + } + + ref = ref_tmp; + } + + for (v_cnt = 0; v_cnt < 16; v_cnt++) { + dst_org = dst; + idx0 = angle_loop >> 5; + fact_val0 = angle_loop & 31; + angle_loop += angle; + + idx1 = angle_loop >> 5; + fact_val1 = angle_loop & 31; + angle_loop += angle; + + top0 = LD_SB(ref + idx0 + 1); + top4 = LD_SB(ref + idx1 + 1); + top1 = LD_SB(ref + idx0 + 17); + top5 = LD_SB(ref + idx1 + 17); + top3 = LD_SB(ref + idx0 + 33); + top7 = LD_SB(ref + idx1 + 33); + + fact0 = __msa_fill_h(fact_val0); + fact1 = __msa_fill_h(32 - fact_val0); + fact2 = __msa_fill_h(fact_val1); + fact3 = __msa_fill_h(32 - fact_val1); + + top2 = top1; + top6 = top5; + + SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1); + SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1); + + UNPCK_UB_SH(top0, diff0, diff1); + UNPCK_UB_SH(top1, diff2, diff3); + UNPCK_UB_SH(top2, diff4, diff5); + UNPCK_UB_SH(top3, diff6, diff7); + UNPCK_UB_SH(top4, diff8, diff9); + UNPCK_UB_SH(top5, diff10, diff11); + UNPCK_UB_SH(top6, diff12, diff13); + UNPCK_UB_SH(top7, diff14, diff15); + + MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0, + diff2, diff3, diff6, diff7); + MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2, + diff10, diff11, diff14, diff15); + + diff2 += diff0 * fact1; + diff3 += diff1 * fact1; + diff6 += diff4 * fact1; + diff7 += diff5 * fact1; + diff10 += diff8 * fact3; + diff11 += diff9 * fact3; + diff14 += diff12 * fact3; + diff15 += diff13 * fact3; + + SRARI_H4_SH(diff2, diff3, diff6, diff7, 5); + SRARI_H4_SH(diff10, diff11, diff14, diff15, 5); + PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14, + dst_val0, dst_val1, dst_val2, dst_val3); + ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1); + ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3); + + ST2x4_UB(diff0, 0, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff0, 4, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff1, 0, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff1, 4, dst_org, stride); + dst_org += (4 * stride); + + ST2x4_UB(diff2, 0, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff2, 4, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff3, 0, dst_org, stride); + dst_org += (4 * stride); + ST2x4_UB(diff3, 4, dst_org, stride); + dst_org += (4 * stride); + + dst += 2; 
+ } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride) +{ + hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride); +} + +void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride) +{ + hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride); +} + +void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride) +{ + hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride); +} + +void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride) +{ + hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride); +} + +void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride, int log2, int c_idx) +{ + switch (log2) { + case 2: + hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx); + break; + + case 3: + hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx); + break; + + case 4: + hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx); + break; + + case 5: + hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride); + break; + } +} + +void ff_pred_intra_pred_angular_0_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride, int c_idx, int mode) +{ + if (mode == 10) { + hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode == 26) { + hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode >= 18) { + hevc_intra_pred_angular_upper_4width_msa(src_top, src_left, + dst, stride, mode); + } else { + hevc_intra_pred_angular_lower_4width_msa(src_top, src_left, + dst, stride, mode); + } +} + +void ff_pred_intra_pred_angular_1_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride, int c_idx, int mode) +{ + if (mode == 10) { + hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode == 26) { + hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode >= 18) { + hevc_intra_pred_angular_upper_8width_msa(src_top, src_left, + dst, stride, mode); + } else { + hevc_intra_pred_angular_lower_8width_msa(src_top, src_left, + dst, stride, mode); + } +} + +void ff_pred_intra_pred_angular_2_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride, int c_idx, int mode) +{ + if (mode == 10) { + hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode == 26) { + hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx); + } else if (mode >= 18) { + hevc_intra_pred_angular_upper_16width_msa(src_top, src_left, + dst, stride, mode); + } else { + hevc_intra_pred_angular_lower_16width_msa(src_top, src_left, + dst, stride, mode); + } +} + +void ff_pred_intra_pred_angular_3_msa(uint8_t *dst, + const uint8_t *src_top, + const uint8_t *src_left, + ptrdiff_t stride, int c_idx, int mode) +{ + if (mode == 10) { + hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride); + } else if (mode == 26) { + 
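+        /* mode 26 is pure vertical prediction: the 32-sample top row is
+         * replicated into all 32 output rows. */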
intra_predict_vert_32x32_msa(src_top, dst, stride); + } else if (mode >= 18) { + hevc_intra_pred_angular_upper_32width_msa(src_top, src_left, + dst, stride, mode); + } else { + hevc_intra_pred_angular_lower_32width_msa(src_top, src_left, + dst, stride, mode); + } +} + +void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx) +{ + v16u8 vec0; + HEVCLocalContext *lc = s->HEVClc; + int i; + int hshift = s->ps.sps->hshift[c_idx]; + int vshift = s->ps.sps->vshift[c_idx]; + int size_in_luma_h = 16 << hshift; + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = 16 << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; + int x = x0 >> hshift; + int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = + s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)]; + + ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t); + uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride; + + int min_pu_width = s->ps.sps->min_pu_width; + + enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c : + lc->tu.intra_pred_mode; + uint32_t a; + uint8_t left_array[2 * 32 + 1]; + uint8_t filtered_left_array[2 * 32 + 1]; + uint8_t top_array[2 * 32 + 1]; + uint8_t filtered_top_array[2 * 32 + 1]; + + uint8_t *left = left_array + 1; + uint8_t *top = top_array + 1; + uint8_t *filtered_left = filtered_left_array + 1; + uint8_t *filtered_top = filtered_top_array + 1; + int cand_bottom_left = lc->na.cand_bottom_left + && cur_tb_addr > + s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) * + (s->ps.sps->tb_mask + 2) + (x_tb - 1)]; + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; + int cand_up = lc->na.cand_up; + int cand_up_right = lc->na.cand_up_right + && cur_tb_addr > + s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) + + ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)]; + + int bottom_left_size = + (((y0 + 2 * size_in_luma_v) > + (s->ps.sps->height) ? (s->ps.sps->height) : (y0 + + 2 * size_in_luma_v)) - + (y0 + size_in_luma_v)) >> vshift; + int top_right_size = + (((x0 + 2 * size_in_luma_h) > + (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) - + (x0 + size_in_luma_h)) >> hshift; + + if (s->ps.pps->constrained_intra_pred_flag == 1) { + int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size); + int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size); + int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1)); + int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1)); + if (!size_in_luma_pu_h) + size_in_luma_pu_h++; + if (cand_bottom_left == 1 && on_pu_edge_x) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_bottom_pu = + ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_v) > + (s->ps.sps->min_pu_height - + y_bottom_pu) ? 
(s->ps.sps->min_pu_height - + y_bottom_pu) : (size_in_luma_pu_v)); + cand_bottom_left = 0; + for (i = 0; i < max; i += 2) + cand_bottom_left |= + ((s->ref->tab_mvf[(x_left_pu) + + (y_bottom_pu + + i) * min_pu_width]).pred_flag == + PF_INTRA); + } + if (cand_left == 1 && on_pu_edge_x) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_v) > + (s->ps.sps->min_pu_height - + y_left_pu) ? (s->ps.sps->min_pu_height - + y_left_pu) : (size_in_luma_pu_v)); + cand_left = 0; + for (i = 0; i < max; i += 2) + cand_left |= + ((s->ref->tab_mvf[(x_left_pu) + + (y_left_pu + + i) * min_pu_width]).pred_flag == + PF_INTRA); + } + if (cand_up_left == 1) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + cand_up_left = + (s->ref->tab_mvf[(x_left_pu) + + (y_top_pu) * min_pu_width]).pred_flag == + PF_INTRA; + } + if (cand_up == 1 && on_pu_edge_y) { + int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size); + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_h) > + (s->ps.sps->min_pu_width - + x_top_pu) ? (s->ps.sps->min_pu_width - + x_top_pu) : (size_in_luma_pu_h)); + cand_up = 0; + for (i = 0; i < max; i += 2) + cand_up |= + ((s->ref->tab_mvf[(x_top_pu + i) + + (y_top_pu) * + min_pu_width]).pred_flag == PF_INTRA); + } + if (cand_up_right == 1 && on_pu_edge_y) { + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + int x_right_pu = + ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_h) > + (s->ps.sps->min_pu_width - + x_right_pu) ? (s->ps.sps->min_pu_width - + x_right_pu) : (size_in_luma_pu_h)); + cand_up_right = 0; + for (i = 0; i < max; i += 2) + cand_up_right |= + ((s->ref->tab_mvf[(x_right_pu + i) + + (y_top_pu) * + min_pu_width]).pred_flag == PF_INTRA); + } + + vec0 = (v16u8) __msa_ldi_b(128); + + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + + ST_UB4(vec0, vec0, vec0, vec0, top, 16); + + top[-1] = 128; + } + if (cand_up_left) { + left[-1] = src[(-1) + stride * (-1)]; + top[-1] = left[-1]; + } + if (cand_up) { + vec0 = LD_UB(src - stride); + ST_UB(vec0, top); + } + if (cand_up_right) { + vec0 = LD_UB(src - stride + 16); + ST_UB(vec0, (top + 16)); + + do { + uint32_t pix = + ((src[(16 + top_right_size - 1) + stride * (-1)]) * + 0x01010101U); + for (i = 0; i < (16 - top_right_size); i += 4) + ((((union unaligned_32 *) (top + 16 + top_right_size + + i))->l) = (pix)); + } while (0); + } + if (cand_left) + for (i = 0; i < 16; i++) + left[i] = src[(-1) + stride * (i)]; + if (cand_bottom_left) { + for (i = 16; i < 16 + bottom_left_size; i++) + left[i] = src[(-1) + stride * (i)]; + do { + uint32_t pix = + ((src[(-1) + stride * (16 + bottom_left_size - 1)]) * + 0x01010101U); + for (i = 0; i < (16 - bottom_left_size); i += 4) + ((((union unaligned_32 *) (left + 16 + bottom_left_size + + i))->l) = (pix)); + } while (0); + } + + if (s->ps.pps->constrained_intra_pred_flag == 1) { + if (cand_bottom_left || cand_left || cand_up_left || cand_up + || cand_up_right) { + int size_max_x = + x0 + ((2 * 16) << hshift) < + s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift; + int size_max_y = + y0 + ((2 * 16) << vshift) < + s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift; + int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1; + if (!cand_up_right) { + size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ? 
+ 16 : (s->ps.sps->width - x0) >> hshift; + } + if (!cand_bottom_left) { + size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ? + 16 : (s->ps.sps->height - y0) >> vshift; + } + if (cand_bottom_left || cand_left || cand_up_left) { + while (j > -1 + && + !((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((j) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j--; + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((j) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == PF_INTRA)) { + j = 0; + while (j < size_max_x + && + !((s->ref->tab_mvf[(((x0 + + ((j) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((-1) << + vshift)) + >> s-> + ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j++; + for (i = j; i > (j) - (j + 1); i--) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((-1) << + vshift)) + >> s-> + ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + left[-1] = top[-1]; + } + } else { + j = 0; + while (j < size_max_x + && + !((s->ref->tab_mvf[(((x0 + + ((j) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j++; + if (j > 0) + if (x0 > 0) { + for (i = j; i > (j) - (j + 1); i--) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> + s->ps.sps->log2_min_pu_size)) + + (((y0 + ((-1) + << vshift)) + >> + s->ps.sps->log2_min_pu_size)) + * + min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + } else { + for (i = j; i > (j) - (j); i--) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> + s->ps.sps->log2_min_pu_size)) + + (((y0 + ((-1) + << vshift)) + >> + s->ps.sps->log2_min_pu_size)) + * + min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + top[-1] = top[0]; + } + left[-1] = top[-1]; + } + left[-1] = top[-1]; + if (cand_bottom_left || cand_left) { + a = ((left[-1]) * 0x01010101U); + for (i = 0; i < (0) + (size_max_y); i += 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i]))->l) = (a)); + else + a = ((left[i + 3]) * 0x01010101U); + } + if (!cand_left) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB(vec0, left); + } + if (!cand_bottom_left) { + + vec0 = (v16u8) __msa_fill_b(left[15]); + + ST_UB(vec0, (left + 16)); + } + if (x0 != 0 && y0 != 0) { + a = ((left[size_max_y - 1]) * 0x01010101U); + for (i = (size_max_y - 1); + i > (size_max_y - 1) - (size_max_y); i -= 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i - + 3) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i - 3]))->l) = (a)); + else + a = ((left[i - 3]) * 0x01010101U); + if (! 
+ ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == PF_INTRA)) + left[-1] = left[0]; + } else if (x0 == 0) { + do { + uint32_t pix = ((0) * 0x01010101U); + for (i = 0; i < (size_max_y); i += 4) + ((((union unaligned_32 *) (left + i))->l) = (pix)); + } while (0); + } else { + a = ((left[size_max_y - 1]) * 0x01010101U); + for (i = (size_max_y - 1); + i > (size_max_y - 1) - (size_max_y); i -= 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i - + 3) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i - 3]))->l) = (a)); + else + a = ((left[i - 3]) * 0x01010101U); + } + top[-1] = left[-1]; + if (y0 != 0) { + a = ((left[-1]) * 0x01010101U); + for (i = 0; i < (0) + (size_max_x); i += 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&top[i]))->l) = (a)); + else + a = ((top[i + 3]) * 0x01010101U); + } + } + } + + if (!cand_bottom_left) { + if (cand_left) { + vec0 = (v16u8) __msa_fill_b(left[15]); + + ST_UB(vec0, (left + 16)); + + } else if (cand_up_left) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB2(vec0, vec0, left, 16); + + cand_left = 1; + } else if (cand_up) { + left[-1] = top[0]; + + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB2(vec0, vec0, left, 16); + + cand_up_left = 1; + cand_left = 1; + } else if (cand_up_right) { + vec0 = (v16u8) __msa_fill_b(top[16]); + + ST_UB(vec0, top); + + left[-1] = top[16]; + + ST_UB2(vec0, vec0, left, 16); + + cand_up = 1; + cand_up_left = 1; + cand_left = 1; + } else { + left[-1] = 128; + vec0 = (v16u8) __msa_ldi_b(128); + + ST_UB2(vec0, vec0, top, 16); + ST_UB2(vec0, vec0, left, 16); + } + } + + if (!cand_left) { + vec0 = (v16u8) __msa_fill_b(left[16]); + ST_UB(vec0, left); + } + if (!cand_up_left) { + left[-1] = left[0]; + } + if (!cand_up) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + ST_UB(vec0, top); + } + if (!cand_up_right) { + vec0 = (v16u8) __msa_fill_b(top[15]); + ST_UB(vec0, (top + 16)); + } + + top[-1] = left[-1]; + + + if (!s->ps.sps->intra_smoothing_disabled_flag + && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && 16 != 4) { + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; + int min_dist_vert_hor = + (((((int) (mode - 26U)) >= + 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) > + ((((int) (mode - 10U)) >= + 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U))))) + ? ((((int) (mode - 10U)) >= + 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U))))) + : ((((int) (mode - 26U)) >= + 0 ? 
((int) (mode - 26U)) : (-((int) (mode - 26U)))))); + if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) { + filtered_left[2 * 16 - 1] = left[2 * 16 - 1]; + filtered_top[2 * 16 - 1] = top[2 * 16 - 1]; + for (i = 2 * 16 - 2; i >= 0; i--) + filtered_left[i] = (left[i + 1] + 2 * left[i] + + left[i - 1] + 2) >> 2; + filtered_top[-1] = + filtered_left[-1] = + (left[0] + 2 * left[-1] + top[0] + 2) >> 2; + for (i = 2 * 16 - 2; i >= 0; i--) + filtered_top[i] = (top[i + 1] + 2 * top[i] + + top[i - 1] + 2) >> 2; + left = filtered_left; + top = filtered_top; + } + } + } + + switch (mode) { + case INTRA_PLANAR: + s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride); + break; + case INTRA_DC: + s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride, 4, c_idx); + break; + default: + s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride, c_idx, mode); + break; + } +} + +void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx) +{ + v16u8 vec0, vec1; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 res0, res1, res2, res3; + v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 }; + v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 }; + HEVCLocalContext *lc = s->HEVClc; + int i; + int hshift = s->ps.sps->hshift[c_idx]; + int vshift = s->ps.sps->vshift[c_idx]; + int size_in_luma_h = 32 << hshift; + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = 32 << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; + int x = x0 >> hshift; + int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = + s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)]; + + ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t); + uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride; + + int min_pu_width = s->ps.sps->min_pu_width; + + enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c : + lc->tu.intra_pred_mode; + uint32_t a; + uint8_t left_array[2 * 32 + 1]; + uint8_t filtered_left_array[2 * 32 + 1]; + uint8_t top_array[2 * 32 + 1]; + uint8_t filtered_top_array[2 * 32 + 1]; + + uint8_t *left = left_array + 1; + uint8_t *top = top_array + 1; + uint8_t *filtered_left = filtered_left_array + 1; + uint8_t *filtered_top = filtered_top_array + 1; + int cand_bottom_left = lc->na.cand_bottom_left + && cur_tb_addr > + s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) * + (s->ps.sps->tb_mask + 2) + (x_tb - 1)]; + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; + int cand_up = lc->na.cand_up; + int cand_up_right = lc->na.cand_up_right + && cur_tb_addr > + s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) + + ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)]; + + int bottom_left_size = + (((y0 + 2 * size_in_luma_v) > + (s->ps.sps->height) ? (s->ps.sps->height) : (y0 + + 2 * size_in_luma_v)) - + (y0 + size_in_luma_v)) >> vshift; + int top_right_size = + (((x0 + 2 * size_in_luma_h) > + (s->ps.sps->width) ? 
(s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) - + (x0 + size_in_luma_h)) >> hshift; + + if (s->ps.pps->constrained_intra_pred_flag == 1) { + int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size); + int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size); + int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1)); + int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1)); + if (!size_in_luma_pu_h) + size_in_luma_pu_h++; + if (cand_bottom_left == 1 && on_pu_edge_x) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_bottom_pu = + ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_v) > + (s->ps.sps->min_pu_height - + y_bottom_pu) ? (s->ps.sps->min_pu_height - + y_bottom_pu) : (size_in_luma_pu_v)); + cand_bottom_left = 0; + for (i = 0; i < max; i += 2) + cand_bottom_left |= + ((s->ref->tab_mvf[(x_left_pu) + + (y_bottom_pu + + i) * min_pu_width]).pred_flag == + PF_INTRA); + } + if (cand_left == 1 && on_pu_edge_x) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_v) > + (s->ps.sps->min_pu_height - + y_left_pu) ? (s->ps.sps->min_pu_height - + y_left_pu) : (size_in_luma_pu_v)); + cand_left = 0; + for (i = 0; i < max; i += 2) + cand_left |= + ((s->ref->tab_mvf[(x_left_pu) + + (y_left_pu + + i) * min_pu_width]).pred_flag == + PF_INTRA); + } + if (cand_up_left == 1) { + int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size); + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + cand_up_left = + (s->ref->tab_mvf[(x_left_pu) + + (y_top_pu) * min_pu_width]).pred_flag == + PF_INTRA; + } + if (cand_up == 1 && on_pu_edge_y) { + int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size); + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_h) > + (s->ps.sps->min_pu_width - + x_top_pu) ? (s->ps.sps->min_pu_width - + x_top_pu) : (size_in_luma_pu_h)); + cand_up = 0; + for (i = 0; i < max; i += 2) + cand_up |= + ((s->ref->tab_mvf[(x_top_pu + i) + + (y_top_pu) * + min_pu_width]).pred_flag == PF_INTRA); + } + if (cand_up_right == 1 && on_pu_edge_y) { + int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size); + int x_right_pu = + ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size); + int max = + ((size_in_luma_pu_h) > + (s->ps.sps->min_pu_width - + x_right_pu) ? 
(s->ps.sps->min_pu_width - + x_right_pu) : (size_in_luma_pu_h)); + cand_up_right = 0; + for (i = 0; i < max; i += 2) + cand_up_right |= + ((s->ref->tab_mvf[(x_right_pu + i) + + (y_top_pu) * + min_pu_width]).pred_flag == PF_INTRA); + } + vec0 = (v16u8) __msa_ldi_b(128); + + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + ST_UB4(vec0, vec0, vec0, vec0, top, 16); + + top[-1] = 128; + } + if (cand_up_left) { + left[-1] = src[(-1) + stride * (-1)]; + top[-1] = left[-1]; + } + if (cand_up) { + LD_UB2(src - stride, 16, vec0, vec1); + ST_UB2(vec0, vec1, top, 16); + } + + if (cand_up_right) { + LD_UB2(src - stride + 32, 16, vec0, vec1); + ST_UB2(vec0, vec1, (top + 32), 16); + do { + uint32_t pix = + ((src[(32 + top_right_size - 1) + stride * (-1)]) * + 0x01010101U); + for (i = 0; i < (32 - top_right_size); i += 4) + ((((union unaligned_32 *) (top + 32 + top_right_size + + i))->l) = (pix)); + } while (0); + } + if (cand_left) + for (i = 0; i < 32; i++) + left[i] = src[(-1) + stride * (i)]; + if (cand_bottom_left) { + for (i = 32; i < 32 + bottom_left_size; i++) + left[i] = src[(-1) + stride * (i)]; + do { + uint32_t pix = + ((src[(-1) + stride * (32 + bottom_left_size - 1)]) * + 0x01010101U); + for (i = 0; i < (32 - bottom_left_size); i += 4) + ((((union unaligned_32 *) (left + 32 + bottom_left_size + + i))->l) = (pix)); + } while (0); + } + + if (s->ps.pps->constrained_intra_pred_flag == 1) { + if (cand_bottom_left || cand_left || cand_up_left || cand_up + || cand_up_right) { + int size_max_x = + x0 + ((2 * 32) << hshift) < + s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift; + int size_max_y = + y0 + ((2 * 32) << vshift) < + s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift; + int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1; + if (!cand_up_right) { + size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ? + 32 : (s->ps.sps->width - x0) >> hshift; + } + if (!cand_bottom_left) { + size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ? + 32 : (s->ps.sps->height - y0) >> vshift; + } + if (cand_bottom_left || cand_left || cand_up_left) { + while (j > -1 + && + !((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((j) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j--; + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((j) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == PF_INTRA)) { + j = 0; + while (j < size_max_x + && + !((s->ref->tab_mvf[(((x0 + + ((j) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((-1) << + vshift)) + >> s-> + ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j++; + for (i = j; i > (j) - (j + 1); i--) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((-1) << + vshift)) + >> s-> + ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + left[-1] = top[-1]; + } + } else { + j = 0; + while (j < size_max_x + && + !((s->ref->tab_mvf[(((x0 + + ((j) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + j++; + if (j > 0) + if (x0 > 0) { + for (i = j; i > (j) - (j + 1); i--) + if (! 
+ ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> + s->ps.sps->log2_min_pu_size)) + + (((y0 + ((-1) + << vshift)) + >> + s->ps.sps->log2_min_pu_size)) + * + min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + } else { + for (i = j; i > (j) - (j); i--) + if (! + ((s->ref->tab_mvf[(((x0 + + ((i - + 1) << hshift)) >> + s->ps.sps->log2_min_pu_size)) + + (((y0 + ((-1) + << vshift)) + >> + s->ps.sps->log2_min_pu_size)) + * + min_pu_width]).pred_flag == + PF_INTRA)) + top[i - 1] = top[i]; + top[-1] = top[0]; + } + left[-1] = top[-1]; + } + left[-1] = top[-1]; + if (cand_bottom_left || cand_left) { + a = ((left[-1]) * 0x01010101U); + for (i = 0; i < (0) + (size_max_y); i += 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i]))->l) = (a)); + else + a = ((left[i + 3]) * 0x01010101U); + } + if (!cand_left) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB2(vec0, vec0, left, 16); + } + if (!cand_bottom_left) { + vec0 = (v16u8) __msa_fill_b(left[31]); + + ST_UB2(vec0, vec0, (left + 32), 16); + } + if (x0 != 0 && y0 != 0) { + a = ((left[size_max_y - 1]) * 0x01010101U); + for (i = (size_max_y - 1); + i > (size_max_y - 1) - (size_max_y); i -= 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i - + 3) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i - 3]))->l) = (a)); + else + a = ((left[i - 3]) * 0x01010101U); + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == PF_INTRA)) + left[-1] = left[0]; + } else if (x0 == 0) { + do { + uint32_t pix = ((0) * 0x01010101U); + for (i = 0; i < (size_max_y); i += 4) + ((((union unaligned_32 *) (left + i))->l) = (pix)); + } while (0); + } else { + a = ((left[size_max_y - 1]) * 0x01010101U); + for (i = (size_max_y - 1); + i > (size_max_y - 1) - (size_max_y); i -= 4) + if (! + ((s->ref->tab_mvf[(((x0 + + ((-1) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + + ((i - + 3) << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&left[i - 3]))->l) = (a)); + else + a = ((left[i - 3]) * 0x01010101U); + } + top[-1] = left[-1]; + if (y0 != 0) { + a = ((left[-1]) * 0x01010101U); + for (i = 0; i < (0) + (size_max_x); i += 4) + if (! 
+ ((s->ref->tab_mvf[(((x0 + + ((i) << hshift)) >> s->ps.sps-> + log2_min_pu_size)) + (((y0 + ((-1) + << + vshift)) + >> s->ps.sps-> + log2_min_pu_size)) + * min_pu_width]).pred_flag == + PF_INTRA)) + ((((union unaligned_32 *) (&top[i]))->l) = (a)); + else + a = ((top[i + 3]) * 0x01010101U); + } + } + } + + if (!cand_bottom_left) { + if (cand_left) { + vec0 = (v16u8) __msa_fill_b(left[31]); + + ST_UB2(vec0, vec0, (left + 32), 16); + } else if (cand_up_left) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + + cand_left = 1; + } else if (cand_up) { + left[-1] = top[0]; + + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + + cand_up_left = 1; + cand_left = 1; + } else if (cand_up_right) { + vec0 = (v16u8) __msa_fill_b(top[32]); + + ST_UB2(vec0, vec0, top, 16); + + left[-1] = top[32]; + + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + + cand_up = 1; + cand_up_left = 1; + cand_left = 1; + } else { + left[-1] = 128; + + vec0 = (v16u8) __msa_ldi_b(128); + + ST_UB4(vec0, vec0, vec0, vec0, top, 16); + ST_UB4(vec0, vec0, vec0, vec0, left, 16); + } + } + + if (!cand_left) { + vec0 = (v16u8) __msa_fill_b(left[32]); + + ST_UB2(vec0, vec0, left, 16); + } + if (!cand_up_left) { + left[-1] = left[0]; + } + if (!cand_up) { + vec0 = (v16u8) __msa_fill_b(left[-1]); + + ST_UB2(vec0, vec0, top, 16); + } + if (!cand_up_right) { + vec0 = (v16u8) __msa_fill_b(top[31]); + + ST_UB2(vec0, vec0, (top + 32), 16); + } + + top[-1] = left[-1]; + + + if (!s->ps.sps->intra_smoothing_disabled_flag + && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && 32 != 4) { + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; + int min_dist_vert_hor = + (((((int) (mode - 26U)) >= + 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) > + ((((int) (mode - 10U)) >= + 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U))))) + ? ((((int) (mode - 10U)) >= + 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U))))) + : ((((int) (mode - 26U)) >= + 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U)))))); + if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) { + int threshold = 1 << (8 - 5); + if (s->ps.sps->sps_strong_intra_smoothing_enable_flag + && c_idx == 0 + && ((top[-1] + top[63] - 2 * top[31]) >= + 0 ? (top[-1] + top[63] - + 2 * top[31]) : (-(top[-1] + top[63] - + 2 * top[31]))) < threshold + && ((left[-1] + left[63] - 2 * left[31]) >= + 0 ? 
(left[-1] + left[63] - + 2 * left[31]) : (-(left[-1] + left[63] - + 2 * left[31]))) < threshold) { + + + filtered_top[-1] = top[-1]; + filtered_top[63] = top[63]; + + + for (i = 0; i < 63; i++) { + filtered_top[i] = + ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6; + } + + tmp0 = __msa_fill_h(top[-1]); + tmp1 = __msa_fill_h(top[63]); + + tmp2 = mul_val0 - 8; + tmp3 = mul_val0 - 16; + tmp4 = mul_val0 - 24; + tmp5 = mul_val1 + 8; + tmp6 = mul_val1 + 16; + tmp7 = mul_val1 + 24; + + res0 = mul_val0 * tmp0; + res1 = tmp2 * tmp0; + res2 = tmp3 * tmp0; + res3 = tmp4 * tmp0; + res0 += mul_val1 * tmp1; + res1 += tmp5 * tmp1; + res2 += tmp6 * tmp1; + res3 += tmp7 * tmp1; + + res0 = __msa_srari_h(res0, 6); + res1 = __msa_srari_h(res1, 6); + res2 = __msa_srari_h(res2, 6); + res3 = __msa_srari_h(res3, 6); + + vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0); + vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2); + + ST_UB2(vec0, vec1, filtered_top, 16); + + res0 = mul_val0 - 32; + tmp2 = mul_val0 - 40; + tmp3 = mul_val0 - 48; + tmp4 = mul_val0 - 56; + res3 = mul_val1 + 32; + tmp5 = mul_val1 + 40; + tmp6 = mul_val1 + 48; + tmp7 = mul_val1 + 56; + + res0 = res0 * tmp0; + res1 = tmp2 * tmp0; + res2 = tmp3 * tmp0; + res0 += res3 * tmp1; + res3 = tmp4 * tmp0; + res1 += tmp5 * tmp1; + res2 += tmp6 * tmp1; + res3 += tmp7 * tmp1; + + res0 = __msa_srari_h(res0, 6); + res1 = __msa_srari_h(res1, 6); + res2 = __msa_srari_h(res2, 6); + res3 = __msa_srari_h(res3, 6); + + vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0); + vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2); + + ST_UB2(vec0, vec1, (filtered_top + 32), 16); + + filtered_top[63] = top[63]; + + tmp0 = __msa_fill_h(left[-1]); + tmp1 = __msa_fill_h(left[63]); + + tmp2 = mul_val0 - 8; + tmp3 = mul_val0 - 16; + tmp4 = mul_val0 - 24; + tmp5 = mul_val1 + 8; + tmp6 = mul_val1 + 16; + tmp7 = mul_val1 + 24; + + res0 = mul_val0 * tmp0; + res1 = tmp2 * tmp0; + res2 = tmp3 * tmp0; + res3 = tmp4 * tmp0; + res0 += mul_val1 * tmp1; + res1 += tmp5 * tmp1; + res2 += tmp6 * tmp1; + res3 += tmp7 * tmp1; + + res0 = __msa_srari_h(res0, 6); + res1 = __msa_srari_h(res1, 6); + res2 = __msa_srari_h(res2, 6); + res3 = __msa_srari_h(res3, 6); + + vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0); + vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2); + + ST_UB2(vec0, vec1, left, 16); + + res0 = mul_val0 - 32; + tmp2 = mul_val0 - 40; + tmp3 = mul_val0 - 48; + tmp4 = mul_val0 - 56; + res3 = mul_val1 + 32; + tmp5 = mul_val1 + 40; + tmp6 = mul_val1 + 48; + tmp7 = mul_val1 + 56; + + res0 = res0 * tmp0; + res1 = tmp2 * tmp0; + res2 = tmp3 * tmp0; + res0 += res3 * tmp1; + res3 = tmp4 * tmp0; + res1 += tmp5 * tmp1; + res2 += tmp6 * tmp1; + res3 += tmp7 * tmp1; + + res0 = __msa_srari_h(res0, 6); + res1 = __msa_srari_h(res1, 6); + res2 = __msa_srari_h(res2, 6); + res3 = __msa_srari_h(res3, 6); + + vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0); + vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2); + + ST_UB2(vec0, vec1, (left + 32), 16); + + left[63] = tmp1[0]; + + top = filtered_top; + } else { + filtered_left[2 * 32 - 1] = left[2 * 32 - 1]; + filtered_top[2 * 32 - 1] = top[2 * 32 - 1]; + for (i = 2 * 32 - 2; i >= 0; i--) + filtered_left[i] = (left[i + 1] + 2 * left[i] + + left[i - 1] + 2) >> 2; + filtered_top[-1] = + filtered_left[-1] = + (left[0] + 2 * left[-1] + top[0] + 2) >> 2; + for (i = 2 * 32 - 2; i >= 0; i--) + filtered_top[i] = (top[i + 1] + 2 * top[i] + + top[i - 1] + 2) >> 2; + left = filtered_left; + top = 
filtered_top; + } + } + } + } + + switch (mode) { + case INTRA_PLANAR: + s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride); + break; + case INTRA_DC: + s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride, 5, c_idx); + break; + default: + s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top, + (uint8_t *) left, stride, c_idx, mode); + break; + } +} diff --git a/libavcodec/mips/hpeldsp_init_mips.c b/libavcodec/mips/hpeldsp_init_mips.c new file mode 100644 index 0000000000..363a04514f --- /dev/null +++ b/libavcodec/mips/hpeldsp_init_mips.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../hpeldsp.h" +#include "libavcodec/mips/hpeldsp_mips.h" + +#if HAVE_MSA +static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = ff_put_pixels16_msa; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_msa; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_msa; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_msa; + + c->put_pixels_tab[1][0] = ff_put_pixels8_msa; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_msa; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_msa; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_msa; + + c->put_pixels_tab[2][1] = ff_put_pixels4_x2_msa; + c->put_pixels_tab[2][2] = ff_put_pixels4_y2_msa; + c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_msa; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa; + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa; + + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa; + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_msa; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_msa; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_msa; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_msa; + + c->avg_pixels_tab[1][0] = ff_avg_pixels8_msa; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_msa; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_msa; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_msa; + + c->avg_pixels_tab[2][0] = ff_avg_pixels4_msa; + c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_msa; + c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_msa; + c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_msa; +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static void ff_hpeldsp_init_mmi(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = ff_put_pixels16_8_mmi; + 
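    /* The table layout follows HpelDSPContext: the first index selects the
     * block width (0: 16, 1: 8, 2: 4 pixels) and the second the half-pel
     * case (0: integer copy, 1: horizontal, 2: vertical, 3: both). The MSA
     * table above is filled with the same convention. */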
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_8_mmi; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_8_mmi; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_8_mmi; + + c->put_pixels_tab[1][0] = ff_put_pixels8_8_mmi; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_8_mmi; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_8_mmi; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_8_mmi; + + c->put_pixels_tab[2][0] = ff_put_pixels4_8_mmi; + c->put_pixels_tab[2][1] = ff_put_pixels4_x2_8_mmi; + c->put_pixels_tab[2][2] = ff_put_pixels4_y2_8_mmi; + c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_8_mmi; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_8_mmi; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_8_mmi; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_8_mmi; + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_8_mmi; + + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_8_mmi; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_8_mmi; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_8_mmi; + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_8_mmi; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_8_mmi; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_8_mmi; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_8_mmi; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_8_mmi; + + c->avg_pixels_tab[1][0] = ff_avg_pixels8_8_mmi; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_8_mmi; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_8_mmi; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_8_mmi; + + c->avg_pixels_tab[2][0] = ff_avg_pixels4_8_mmi; + c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_8_mmi; + c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_8_mmi; + c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_8_mmi; +} +#endif // #if HAVE_MMI + +void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags) +{ +#if HAVE_MSA + ff_hpeldsp_init_msa(c, flags); +#endif // #if HAVE_MSA +#if HAVE_MMI + ff_hpeldsp_init_mmi(c, flags); +#endif // #if HAVE_MMI +} diff --git a/libavcodec/mips/hpeldsp_mips.h b/libavcodec/mips/hpeldsp_mips.h new file mode 100644 index 0000000000..f527c1d7e6 --- /dev/null +++ b/libavcodec/mips/hpeldsp_mips.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_HPELDSP_MIPS_H +#define AVCODEC_MIPS_HPELDSP_MIPS_H + +#include "libavcodec/bit_depth_template.c" + +void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); + +void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int 
dst_stride, int src_stride1, int src_stride2, + int h); +void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_put_no_rnd_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); +void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h); + +void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void 
ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); +void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int32_t h); + +#endif // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c new file mode 100644 index 0000000000..4c46f00dc7 --- /dev/null +++ b/libavcodec/mips/hpeldsp_mmi.c @@ -0,0 +1,1257 @@ +/* + * Loongson SIMD optimized qpeldsp + * + * Copyright (c) 2016 Loongson Technology Corporation Limited + * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "hpeldsp_mips.h" +#include "libavcodec/bit_depth_template.c" +#include "libavutil/mips/asmdefs.h" +#include "constants.h" + +void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[2]; + mips_reg addr[2]; + uint64_t low32; + + __asm__ volatile ( + PTR_ADDU "%[addr1], %[line_size], %[line_size] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "uld %[low32], 0x00(%[pixels]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "swc1 %[ftmp0], 0x00(%[block]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "uld %[low32], 0x00(%[pixels]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "swc1 %[ftmp0], 0x00(%[block]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [low32]"=&r"(low32), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[2]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDU 
"%[addr1], %[line_size], %[line_size] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[4]; + mips_reg addr[2]; + + __asm__ volatile ( + PTR_ADDU "%[addr1], %[line_size], %[line_size] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[pixels]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[addr0]) \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + "sdc1 %[ftmp2], 0x08(%[block]) \n\t" + "gssdxc1 %[ftmp3], 0x08(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[pixels]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[addr0]) \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + "sdc1 %[ftmp2], 0x08(%[block]) \n\t" + "gssdxc1 %[ftmp3], 0x08(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr1] \n\t" + PTR_ADDU "%[block], %[block], %[addr1] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[4]; + mips_reg addr[3]; + uint64_t low32; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "uld %[low32], 0x00(%[pixels]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + PTR_ADDU "%[addr1], %[block], 
%[line_size] \n\t" + "uld %[low32], 0x00(%[block]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[block]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "uld %[low32], 0x00(%[pixels]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + PTR_ADDU "%[addr1], %[block], %[line_size] \n\t" + "uld %[low32], 0x00(%[block]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[block]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), + [low32]"=&r"(low32), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[4]; + mips_reg addr[3]; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + PTR_ADDU "%[addr1], %[block], %[line_size] \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[block]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[block]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + PTR_ADDU "%[addr1], %[block], %[line_size] \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[block]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[block]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_avg_pixels16_8_mmi(uint8_t *block, 
const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + double ftmp[8]; + mips_reg addr[3]; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pixels]) \n\t" + PTR_ADDU "%[addr1], %[block], %[line_size] \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[block]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[block]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[block]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[block]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[addr1]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + "sdc1 %[ftmp4], 0x08(%[block]) \n\t" + "gssdxc1 %[ftmp5], 0x08(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pixels]) \n\t" + PTR_ADDU "%[addr1], %[block], %[line_size] \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pixels]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[block]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[block]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[block]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[block]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[addr1]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[block], %[line_size]) \n\t" + "sdc1 %[ftmp4], 0x08(%[block]) \n\t" + "gssdxc1 %[ftmp5], 0x08(%[block], %[line_size]) \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t" + PTR_ADDU "%[block], %[block], %[addr2] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), + [block]"+&r"(block), [pixels]"+&r"(pixels), + [h]"+&r"(h) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[4]; + mips_reg addr[5]; + uint64_t low32; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + PTR_ADDU 
"%[addr0], %[src1], %[src_stride1] \n\t" + "uld %[low32], 0x00(%[src1]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[src2]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "uld %[low32], 0x00(%[src1]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[src2]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), + [low32]"=&r"(low32), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), [h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[4]; + mips_reg addr[5]; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + 
"pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), [h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[8]; + mips_reg addr[5]; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[src1]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[src2]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[addr1]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + "sdc1 %[ftmp4], 0x08(%[dst]) \n\t" + "gssdxc1 %[ftmp5], 0x08(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[src1]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[src2]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[addr1]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + "sdc1 %[ftmp4], 0x08(%[dst]) \n\t" 
+ "gssdxc1 %[ftmp5], 0x08(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), [h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[6]; + mips_reg addr[6]; + uint64_t low32; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "uld %[low32], 0x00(%[src1]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[src2]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x00(%[addr5]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "swc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "uld %[low32], 0x00(%[src1]) \n\t" + "mtc1 %[low32], %[ftmp0] \n\t" + "uld %[low32], 0x00(%[addr0]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "uld %[low32], 0x00(%[src2]) \n\t" + "mtc1 %[low32], %[ftmp2] \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "uld %[low32], 0x00(%[addr1]) \n\t" + "mtc1 %[low32], %[ftmp3] \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t" + "uld %[low32], 0x00(%[dst]) \n\t" + "mtc1 %[low32], %[ftmp4] \n\t" + "uld %[low32], 0x00(%[addr5]) \n\t" + "mtc1 %[low32], %[ftmp5] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "swc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gsswxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [low32]"=&r"(low32), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), 
[h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[6]; + mips_reg addr[6]; + + __asm__ volatile ( + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), [h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1, + src_stride2, h); + ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, 
dst_stride, + src_stride1, src_stride2, h); +} + +void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size, + line_size, h); +} + +void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size, + line_size, h); +} + +void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size, + line_size, h); +} + +void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size, + line_size, h); +} + +void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size, + line_size, h); +} + +void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h); + ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h); +} + +inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + double ftmp[5]; + mips_reg addr[5]; + + __asm__ volatile ( + "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t" + PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t" + PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + PTR_ADDU "%[src1], %[src1], %[addr2] \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp4] 
\n\t" + "sdc1 %[ftmp0], 0x00(%[dst]) \n\t" + "gssdxc1 %[ftmp1], 0x00(%[dst], %[dst_stride]) \n\t" + PTR_ADDU "%[src2], %[src2], %[addr3] \n\t" + PTR_ADDU "%[dst], %[dst], %[addr4] \n\t" + + PTR_ADDI "%[h], %[h], -0x04 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]), + [addr4]"=&r"(addr[4]), + [dst]"+&r"(dst), [src1]"+&r"(src1), + [src2]"+&r"(src2), [h]"+&r"(h) + : [dst_stride]"r"((mips_reg)dst_stride), + [src_stride1]"r"((mips_reg)src_stride1), + [src_stride2]"r"((mips_reg)src_stride2) + : "memory" + ); +} + +void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, + line_size, line_size, h); +} + +void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h); + ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h); +} + +void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size, + line_size, line_size, h); +} + +void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size, + line_size, line_size, h); +} + +void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size, + line_size, line_size, h); +} + +void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size, + line_size, line_size, h); +} + +void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size, + line_size, line_size, h); +} + +void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h); + ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h); +} + +void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size, + line_size, line_size, line_size, h); +} + +void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h); + ff_put_no_rnd_pixels8_y2_8_mmi(block + 8 , pixels + 8, line_size, h); +} + +void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + /* FIXME HIGH BIT DEPTH */ + int i; + const uint32_t a = AV_RN32(pixels); + const uint32_t b = AV_RN32(pixels + 1); + uint32_t l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = AV_RN32(pixels); + uint32_t b = AV_RN32(pixels + 1); + l1 = (a & 0x03030303UL) + + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) 
block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = AV_RN32(pixels); + b = AV_RN32(pixels + 1); + l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } +} + +void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ +#if 1 + double ftmp[10]; + mips_reg addr[2]; + + __asm__ volatile ( + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "dli %[addr0], 0x0f \n\t" + "pcmpeqw %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "dmtc1 %[addr0], %[ftmp8] \n\t" + "dli %[addr0], 0x01 \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "dmtc1 %[addr0], %[ftmp8] \n\t" + "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "dli %[addr0], 0x02 \n\t" + "gsldlc1 %[ftmp0], 0x07(%[pixels]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "dmtc1 %[addr0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp4], 0x08(%[pixels]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[pixels]) \n\t" + "mov.d %[ftmp1], %[ftmp0] \n\t" + "mov.d %[ftmp5], %[ftmp4] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "xor %[addr0], %[addr0], %[addr0] \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + ".p2align 3 \n\t" + "1: \n\t" + PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t" + "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp2], 0x08(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x01(%[addr1]) \n\t" + "mov.d %[ftmp1], %[ftmp0] \n\t" + "mov.d %[ftmp3], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "paddush %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "paddush %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "paddush %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + "paddush %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psrlh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "gssdxc1 %[ftmp4], 0x00(%[block], %[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], %[line_size] \n\t" + PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t" + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp4], 0x08(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x01(%[addr1]) \n\t" + "mov.d %[ftmp3], %[ftmp2] \n\t" + "mov.d %[ftmp5], %[ftmp4] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "paddush %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "paddush %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "paddush %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "paddush %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "paddush %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "paddush %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "gssdxc1 %[ftmp0], 0x00(%[block], %[addr0]) \n\t" + PTR_ADDU "%[addr0], %[addr0], 
%[line_size] \n\t" + PTR_ADDU "%[h], %[h], -0x02 \n\t" + "bnez %[h], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [h]"+&r"(h), [pixels]"+&r"(pixels) + : [block]"r"(block), [line_size]"r"((mips_reg)line_size) + : "memory" + ); +#else + /* FIXME HIGH BIT DEPTH */ + int j; + + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = AV_RN32(pixels); + const uint32_t b = AV_RN32(pixels + 1); + uint32_t l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = AV_RN32(pixels); + uint32_t b = AV_RN32(pixels + 1); + l1 = (a & 0x03030303UL) + + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = AV_RN32(pixels); + b = AV_RN32(pixels + 1); + l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } + pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } +#endif +} + +void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h); + ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h); +} + +void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + /* FIXME HIGH BIT DEPTH */ + int i; + const uint32_t a = AV_RN32(pixels); + const uint32_t b = AV_RN32(pixels + 1); + uint32_t l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = AV_RN32(pixels); + uint32_t b = AV_RN32(pixels + 1); + l1 = (a & 0x03030303UL) + + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); + pixels += line_size; + block += line_size; + a = AV_RN32(pixels); + b = AV_RN32(pixels + 1); + l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); + pixels += line_size; + block += line_size; + } +} + +void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + /* FIXME HIGH BIT DEPTH */ + int j; + + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = AV_RN32(pixels); + const uint32_t b = AV_RN32(pixels + 1); + uint32_t l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = AV_RN32(pixels); + uint32_t b = AV_RN32(pixels + 1); + l1 = (a & 0x03030303UL) + + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 
0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); + pixels += line_size; + block += line_size; + a = AV_RN32(pixels); + b = AV_RN32(pixels + 1); + l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); + pixels += line_size; + block += line_size; + } + pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } +} + +void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h); + ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h); +} + +void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + /* FIXME HIGH BIT DEPTH */ + int j; + + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = AV_RN32(pixels); + const uint32_t b = AV_RN32(pixels + 1); + uint32_t l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x01010101UL; + uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = AV_RN32(pixels); + uint32_t b = AV_RN32(pixels + 1); + l1 = (a & 0x03030303UL) + + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = AV_RN32(pixels); + b = AV_RN32(pixels + 1); + l0 = (a & 0x03030303UL) + + (b & 0x03030303UL) + + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } + pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } +} + +void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h); + ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h); +} diff --git a/libavcodec/mips/hpeldsp_msa.c b/libavcodec/mips/hpeldsp_msa.c new file mode 100644 index 0000000000..40a0dca0fe --- /dev/null +++ b/libavcodec/mips/hpeldsp_msa.c @@ -0,0 +1,1498 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "libavcodec/mips/hpeldsp_mips.h" + +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ +{ \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \ + ST_UB(tmp_m, (pdst)); \ +} + +#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + \ + PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride); \ +} + +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ + pdst, stride) \ +{ \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +} + +static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t out0, out1; + v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1); + AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1); + + out0 = __msa_copy_u_w((v4i32) res0, 0); + out1 = __msa_copy_u_w((v4i32) res1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } +} + +static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + SLDI_B4_0_SB(src0, src1, src2, src3, + src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); + AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, + src2, src2_sld1, src3, src3_sld1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + src += (8 * src_stride); + + AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11, + dst, dst_stride); + dst += (4 * dst_stride); + + AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15, + dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1; + v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1; + + LD_SB8(src, src_stride, 
src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + SLDI_B4_0_SB(src0, src1, src2, src3, + src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); + SLDI_B4_0_SB(src4, src5, src6, src7, + src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1); + + AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, + src2, src2_sld1, src3, src3_sld1, dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1, + src6, src6_sld1, src7, src7_sld1, dst, dst_stride); +} + +static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + SLDI_B4_0_SB(src0, src1, src2, src3, + src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); + AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, + src2, src2_sld1, src3, src3_sld1, dst, dst_stride); +} + +static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 src9, src10, src11, src12, src13, src14, src15; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + src += (8 * src_stride); + + AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11, + dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4((src + 1), src_stride, src8, src9, src10, src11); + src += (4 * src_stride); + + AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15, + dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src4, src5, src6, src7); + LD_UB4((src + 1), src_stride, src12, src13, src14, src15); + src += (4 * src_stride); + + AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15, + dst, dst_stride); +} + +static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 src9, src10, src11, src12, src13, src14, src15; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + + AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15, + dst, dst_stride); +} + +static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t dst0, dst1, out0, out1; + v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1; + v16u8 tmp0 = { 0 }; + v16u8 tmp1 = { 0 }; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1); + + dst0 = LW(dst); + dst1 = LW(dst + dst_stride); + tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0); + tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1); + + AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + + out0 = __msa_copy_u_w((v4i32) res0, 0); + out1 = __msa_copy_u_w((v4i32) res1, 0); + 
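+ /* res0/res1 now hold the horizontal half-pel average already merged with the destination rows; only the low 4 bytes of each are stored */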
SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } +} + +static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + SLDI_B4_0_SB(src0, src1, src2, src3, + src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); + + AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1, + src3, src3_sld1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 src9, src10, src11, src12, src13, src14, src15; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + src += (8 * src_stride); + + AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11, + dst, dst_stride); + dst += (4 * dst_stride); + AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15, + dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t out0, out1; + v16u8 src0, src1, src2, res0, res1; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + src += (2 * src_stride); + + AVER_UB2_UB(src0, src1, src1, src2, res0, res1); + + out0 = __msa_copy_u_w((v4i32) res0, 0); + out1 = __msa_copy_u_w((v4i32) res1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + + src0 = src2; + } +} + +static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + } +} + +static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, + dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_UB(src); + + 
AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + + AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, + dst, dst_stride); +} + +static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4; + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); +} + +static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 src9, src10, src11, src12, src13, src14, src15, src16; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(src, src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + src += (8 * src_stride); + src16 = LD_UB(src); + + AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src12, src13, src13, src14, + src14, src15, src15, src16, dst, dst_stride); +} + +static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_UB(src); + + AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, + dst, dst_stride); +} + +static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t out0, out1, dst0, dst1; + v16u8 src0, src1, src2; + v16u8 tmp0 = { 0 }; + v16u8 tmp1 = { 0 }; + v16u8 res0, res1; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + src += (2 * src_stride); + dst0 = LW(dst); + dst1 = LW(dst + dst_stride); + tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0); + tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1); + AVER_UB2_UB(src0, src1, src1, src2, res0, res1); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + out0 = __msa_copy_u_w((v4i32) res0, 0); + out1 = __msa_copy_u_w((v4i32) res1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + src0 = src2; + } +} + +static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, 
src4, src5, src6, src7, src8; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4, + res0, res1, res2, res3); + AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8, + res4, res5, res6, res7); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3, + res0, res1, res2, res3); + AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7, + res4, res5, res6, res7); + ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride); + dst += (8 * dst_stride); + + src0 = src8; + } +} + +static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t res0, res1; + v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1; + v16u8 src0_r, src1_r, src2_r, res; + v8u16 add0, add1, add2, sum0, sum1; + + src0 = LD_SB(src); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + + SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); + ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, + src0_r, src1_r, src2_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + ADD2(add0, add1, add1, add2, sum0, sum1); + SRARI_H2_UH(sum0, sum1, 2); + res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0); + res0 = __msa_copy_u_w((v4i32) res, 0); + res1 = __msa_copy_u_w((v4i32) res, 2); + SW(res0, dst); + dst += dst_stride; + SW(res1, dst); + dst += dst_stride; + + src0 = src2; + } +} + +static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1; + v16u8 src0_r, src1_r, src2_r, src3_r, src4_r; + v8u16 add0, add1, add2, add3, add4; + v8u16 sum0, sum1, sum2, sum3; + + src0 = LD_SB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); + SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); + ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, + src1_r, src2_r); + ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + HADD_UB2_UH(src3_r, src4_r, add3, add4); + ADD4(add0, add1, add1, add2, add2, add3, add3, add4, + sum0, sum1, sum2, sum3); + SRARI_H4_UH(sum0, sum1, sum2, sum3, 2); + PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, src14, src15, src16, src17; + v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; + v8u16 src7_l, src8_l; + v8u16 
sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; + v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src9, src10, src11, src12, src13, src14, src15, src16); + src += (8 * src_stride); + + src8 = LD_UB(src); + src17 = LD_UB(src + 1); + + ILVRL_B2_UH(src9, src0, src0_r, src0_l); + ILVRL_B2_UH(src10, src1, src1_r, src1_l); + ILVRL_B2_UH(src11, src2, src2_r, src2_l); + ILVRL_B2_UH(src12, src3, src3_r, src3_l); + ILVRL_B2_UH(src13, src4, src4_r, src4_l); + ILVRL_B2_UH(src14, src5, src5_r, src5_l); + ILVRL_B2_UH(src15, src6, src6_r, src6_l); + ILVRL_B2_UH(src16, src7, src7_r, src7_l); + ILVRL_B2_UH(src17, src8, src8_r, src8_l); + HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); + HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); + HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); + HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); + HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); + HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); + ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r, + sum0_r, sum1_r, sum2_r, sum3_r); + ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r, + sum4_r, sum5_r, sum6_r, sum7_r); + ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l, + sum0_l, sum1_l, sum2_l, sum3_l); + ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l, + sum4_l, sum5_l, sum6_l, sum7_l); + SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2); + SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2); + SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2); + SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2); + PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r, + sum3_l, sum3_r, dst, dst_stride); + dst += (4 * dst_stride); + PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r, + sum7_l, sum7_r, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1; + v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1; + v8u16 src0_r, src1_r, src2_r, src3_r; + v8u16 src4_r, src5_r, src6_r, src7_r, src8_r; + v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v16i8 out0, out1; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_UB(src); + + SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, + src3_sld1, 1); + SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1); + SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1); + ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1, + src3, src0_r, src1_r, src2_r, src3_r); + ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r, + src5_r, src6_r); + ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5); + HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8); + + sum0 = add0 + add1 + 1; + sum1 = add1 + add2 + 1; + sum2 = add2 + add3 + 1; + sum3 = add3 + add4 + 1; + sum4 = add4 + add5 + 1; + sum5 = add5 + add6 + 1; + 
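+ /* no-rounding variant: bias of +1 (instead of +2) before the >> 2 applied below */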
sum6 = add6 + add7 + 1; + sum7 = add7 + add8 + 1; + + SRA_4V(sum0, sum1, sum2, sum3, 2); + SRA_4V(sum4, sum5, sum6, sum7, 2); + PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); +} + +static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1; + v8u16 src0_r, src1_r, src2_r, src3_r, src4_r; + v8u16 add0, add1, add2, add3, add4; + v8u16 sum0, sum1, sum2, sum3; + v16i8 out0, out1; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + src4 = LD_SB(src); + + SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); + SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); + ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, + src1_r, src2_r); + ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + HADD_UB2_UH(src3_r, src4_r, add3, add4); + + sum0 = add0 + add1 + 1; + sum1 = add1 + add2 + 1; + sum2 = add2 + add3 + 1; + sum3 = add3 + add4 + 1; + + SRA_4V(sum0, sum1, sum2, sum3, 2); + PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, src14, src15, src16, src17; + v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; + v8u16 src7_l, src8_l; + v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; + v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src9, src10, src11, src12, src13, src14, src15, src16); + src += (8 * src_stride); + src8 = LD_UB(src); + src17 = LD_UB(src + 1); + + ILVRL_B2_UH(src9, src0, src0_r, src0_l); + ILVRL_B2_UH(src10, src1, src1_r, src1_l); + ILVRL_B2_UH(src11, src2, src2_r, src2_l); + ILVRL_B2_UH(src12, src3, src3_r, src3_l); + ILVRL_B2_UH(src13, src4, src4_r, src4_l); + ILVRL_B2_UH(src14, src5, src5_r, src5_l); + ILVRL_B2_UH(src15, src6, src6_r, src6_l); + ILVRL_B2_UH(src16, src7, src7_r, src7_l); + ILVRL_B2_UH(src17, src8, src8_r, src8_l); + + HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); + HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); + HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); + HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); + HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); + HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); + + sum0_r = src0_r + src1_r + 1; + sum1_r = src1_r + src2_r + 1; + sum2_r = src2_r + src3_r + 1; + sum3_r = src3_r + src4_r + 1; + sum4_r = src4_r + src5_r + 1; + sum5_r = src5_r + src6_r + 1; + sum6_r = src6_r + src7_r + 1; + sum7_r = src7_r + src8_r + 1; + sum0_l = src0_l + src1_l + 1; + sum1_l = src1_l + src2_l + 1; + sum2_l = src2_l + src3_l + 1; + sum3_l = src3_l + src4_l + 1; + sum4_l = src4_l + src5_l + 1; + sum5_l = src5_l + src6_l + 1; + sum6_l = src6_l + src7_l + 1; + sum7_l = src7_l + src8_l + 1; + + SRA_4V(sum0_r, sum1_r, sum2_r, 
sum3_r, 2); + SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); + SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); + SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); + PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, + sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src9, src10, src11, src12, src13, src14, src15, src16); + src += (8 * src_stride); + src8 = LD_UB(src); + src17 = LD_UB(src + 1); + + PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, + sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); + dst += (4 * dst_stride); + + ILVRL_B2_UH(src9, src0, src0_r, src0_l); + ILVRL_B2_UH(src10, src1, src1_r, src1_l); + ILVRL_B2_UH(src11, src2, src2_r, src2_l); + ILVRL_B2_UH(src12, src3, src3_r, src3_l); + ILVRL_B2_UH(src13, src4, src4_r, src4_l); + ILVRL_B2_UH(src14, src5, src5_r, src5_l); + ILVRL_B2_UH(src15, src6, src6_r, src6_l); + ILVRL_B2_UH(src16, src7, src7_r, src7_l); + ILVRL_B2_UH(src17, src8, src8_r, src8_l); + + HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); + HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); + HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); + HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); + HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); + HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); + + sum0_r = src0_r + src1_r + 1; + sum1_r = src1_r + src2_r + 1; + sum2_r = src2_r + src3_r + 1; + sum3_r = src3_r + src4_r + 1; + sum4_r = src4_r + src5_r + 1; + sum5_r = src5_r + src6_r + 1; + sum6_r = src6_r + src7_r + 1; + sum7_r = src7_r + src8_r + 1; + sum0_l = src0_l + src1_l + 1; + sum1_l = src1_l + src2_l + 1; + sum2_l = src2_l + src3_l + 1; + sum3_l = src3_l + src4_l + 1; + sum4_l = src4_l + src5_l + 1; + sum5_l = src5_l + src6_l + 1; + sum6_l = src6_l + src7_l + 1; + sum7_l = src7_l + src8_l + 1; + + SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2); + SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); + SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); + SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); + PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, + sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); + dst += (4 * dst_stride); + PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, + sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); +} + +static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, src14, src15, src16, src17; + v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; + v8u16 src7_l, src8_l; + v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; + v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src9, src10, src11, src12, src13, src14, src15, src16); + src += (8 * src_stride); + src8 = LD_UB(src); + src17 = LD_UB(src + 1); + + ILVRL_B2_UH(src9, src0, src0_r, src0_l); + ILVRL_B2_UH(src10, src1, src1_r, src1_l); + ILVRL_B2_UH(src11, src2, src2_r, src2_l); + ILVRL_B2_UH(src12, src3, src3_r, src3_l); + ILVRL_B2_UH(src13, src4, src4_r, src4_l); + ILVRL_B2_UH(src14, src5, src5_r, src5_l); + ILVRL_B2_UH(src15, src6, src6_r, src6_l); + ILVRL_B2_UH(src16, src7, src7_r, src7_l); + ILVRL_B2_UH(src17, src8, src8_r, src8_l); + + 
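+ /* HADD of the interleaved bytes: each 16-bit lane becomes the sum of two horizontally adjacent pixels */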
HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); + HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); + HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); + HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); + HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); + HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); + + sum0_r = src0_r + src1_r + 1; + sum1_r = src1_r + src2_r + 1; + sum2_r = src2_r + src3_r + 1; + sum3_r = src3_r + src4_r + 1; + sum4_r = src4_r + src5_r + 1; + sum5_r = src5_r + src6_r + 1; + sum6_r = src6_r + src7_r + 1; + sum7_r = src7_r + src8_r + 1; + sum0_l = src0_l + src1_l + 1; + sum1_l = src1_l + src2_l + 1; + sum2_l = src2_l + src3_l + 1; + sum3_l = src3_l + src4_l + 1; + sum4_l = src4_l + src5_l + 1; + sum5_l = src5_l + src6_l + 1; + sum6_l = src6_l + src7_l + 1; + sum7_l = src7_l + src8_l + 1; + + SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2); + SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); + SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); + SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); + PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, + sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); + dst += (4 * dst_stride); + PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, + sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); +} + +static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + uint32_t out0, out1; + v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1; + v16u8 src0_r, src1_r, src2_r; + v8u16 add0, add1, add2, sum0, sum1; + v16u8 dst0, dst1, res0, res1; + + src0 = LD_SB(src); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src1, src2); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); + ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, + src1_r, src2_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + ADD2(add0, add1, add1, add2, sum0, sum1); + SRARI_H2_UH(sum0, sum1, 2); + PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + + out0 = __msa_copy_u_w((v4i32) res0, 0); + out1 = __msa_copy_u_w((v4i32) res1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + + src0 = src2; + } +} + +static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16i8 src0, src1, src2, src3, src4; + v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1; + v16u8 dst0, dst1, dst2, dst3; + v16u8 src0_r, src1_r, src2_r, src3_r, src4_r; + v8u16 add0, add1, add2, add3, add4; + v8u16 sum0, sum1, sum2, sum3; + + src0 = LD_SB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); + SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); + ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, + src1_r, src2_r); + ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + HADD_UB2_UH(src3_r, src4_r, add3, add4); + ADD4(add0, add1, add1, add2, add2, add3, add3, add4, + sum0, sum1, sum2, sum3); + 
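+ /* SRARI by 2 is a rounded shift, i.e. (sum + 2) >> 2: the rounded bilinear average of the four neighbouring pixels */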
SRARI_H4_UH(sum0, sum1, sum2, sum3, 2); + PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1, + sum2, dst2, sum3, dst3, dst, dst_stride); + dst += (4 * dst_stride); + src0 = src4; + } +} + +static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + uint8_t height) +{ + uint8_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, src12, src13, src14, src15, src16, src17; + v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; + v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; + v16u8 src7_l, src8_l; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; + v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; + v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8((src + 1), src_stride, + src9, src10, src11, src12, src13, src14, src15, src16); + src += (8 * src_stride); + + src8 = LD_UB(src); + src17 = LD_UB(src + 1); + + ILVRL_B2_UB(src9, src0, src0_r, src0_l); + ILVRL_B2_UB(src10, src1, src1_r, src1_l); + ILVRL_B2_UB(src11, src2, src2_r, src2_l); + ILVRL_B2_UB(src12, src3, src3_r, src3_l); + ILVRL_B2_UB(src13, src4, src4_r, src4_l); + ILVRL_B2_UB(src14, src5, src5_r, src5_l); + ILVRL_B2_UB(src15, src6, src6_r, src6_l); + ILVRL_B2_UB(src16, src7, src7_r, src7_l); + ILVRL_B2_UB(src17, src8, src8_r, src8_l); + HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); + HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5); + HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8); + ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r, + sum2_r, sum3_r); + ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r, + sum6_r, sum7_r); + HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2); + HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5); + HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8); + ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l, + sum2_l, sum3_l); + ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l, + sum6_l, sum7_l); + SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2); + SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2); + SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2); + SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst); + dst += dst_stride; + } +} + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 
= __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, 
dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + out2 = __msa_copy_u_w((v4i32) dst2, 0); + out3 = __msa_copy_u_w((v4i32) dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_d((v2i64) dst0, 0); + out1 = __msa_copy_u_d((v2i64) dst1, 0); + out2 = __msa_copy_u_d((v2i64) dst2, 0); + out3 = __msa_copy_u_d((v2i64) dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + copy_width16_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_16w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_16w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t 
line_size, int h) +{ + common_hv_bil_16w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + copy_width8_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hv_bil_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_4w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_4w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hv_bil_4w_msa(pixels, line_size, block, line_size, h); +} + +void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 16) { + common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size); + } else if (h == 8) { + common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size); + } +} + +void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 16) { + common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size); + } else if (h == 8) { + common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size); + } +} + +void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, + const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 16) { + common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size); + } else if (h == 8) { + common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size); + } +} + +void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 8) { + common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size); + } else if (h == 4) { + common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size); + } +} + +void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 8) { + common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size); + } else if (h == 4) { + common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size); + } +} + +void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + if (h == 8) { + common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size); + } else if (h == 4) { + common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size); + } +} + +void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + avg_width16_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, 
line_size, h); +} + +void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + avg_width8_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + avg_width4_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h); +} + +void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h); +} diff --git a/libavcodec/mips/idctdsp_init_mips.c b/libavcodec/mips/idctdsp_init_mips.c new file mode 100644 index 0000000000..8c26bca538 --- /dev/null +++ b/libavcodec/mips/idctdsp_init_mips.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "idctdsp_mips.h" + +#if HAVE_MSA +static av_cold void idctdsp_init_msa(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) && + (avctx->bits_per_raw_sample != 10) && + (avctx->bits_per_raw_sample != 12) && + (avctx->idct_algo == FF_IDCT_AUTO)) { + c->idct_put = ff_simple_idct_put_msa; + c->idct_add = ff_simple_idct_add_msa; + c->idct = ff_simple_idct_msa; + c->perm_type = FF_IDCT_PERM_NONE; + } + + c->put_pixels_clamped = ff_put_pixels_clamped_msa; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_msa; + c->add_pixels_clamped = ff_add_pixels_clamped_msa; +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void idctdsp_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) && + (avctx->bits_per_raw_sample != 10) && + (avctx->bits_per_raw_sample != 12) && + (avctx->idct_algo == FF_IDCT_AUTO)) { + c->idct = ff_simple_idct_mmi; + c->perm_type = FF_IDCT_PERM_NONE; + } + + c->put_pixels_clamped = ff_put_pixels_clamped_mmi; + c->add_pixels_clamped = ff_add_pixels_clamped_mmi; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmi; +} +#endif /* HAVE_MMI */ + +av_cold void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ +#if HAVE_MSA + idctdsp_init_msa(c, avctx, high_bit_depth); +#endif // #if HAVE_MSA +#if HAVE_MMI + idctdsp_init_mmi(c, avctx, high_bit_depth); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/idctdsp_mips.h b/libavcodec/mips/idctdsp_mips.h new file mode 100644 index 0000000000..19267e6705 --- /dev/null +++ b/libavcodec/mips/idctdsp_mips.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H +#define AVCODEC_MIPS_IDCTDSP_MIPS_H + +#include "../mpegvideo.h" + +void ff_put_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size); +void ff_put_signed_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size); +void ff_add_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size); +void ff_j_rev_dct_msa(int16_t *data); +void ff_jref_idct_put_msa(uint8_t *dest, int32_t stride, int16_t *block); +void ff_jref_idct_add_msa(uint8_t *dest, int32_t stride, int16_t *block); +void ff_simple_idct_msa(int16_t *block); +void ff_simple_idct_put_msa(uint8_t *dest, int32_t stride_dst, int16_t *block); +void ff_simple_idct_add_msa(uint8_t *dest, int32_t stride_dst, int16_t *block); + +void ff_put_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size); +void ff_put_signed_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size); +void ff_add_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size); +void ff_simple_idct_mmi(int16_t *block); +void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block); +void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block); + +#endif // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c new file mode 100644 index 0000000000..24beb62df5 --- /dev/null +++ b/libavcodec/mips/idctdsp_mmi.c @@ -0,0 +1,208 @@ +/* + * Loongson SIMD optimized idctdsp + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "idctdsp_mips.h" +#include "constants.h" +#include "libavutil/mips/asmdefs.h" + +void ff_put_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size) +{ + double ftmp[8]; + mips_reg addr[1]; + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[block]) \n\t" + "ldc1 %[ftmp1], 0x08(%[block]) \n\t" + "ldc1 %[ftmp2], 0x10(%[block]) \n\t" + "ldc1 %[ftmp3], 0x18(%[block]) \n\t" + "ldc1 %[ftmp4], 0x20(%[block]) \n\t" + "ldc1 %[ftmp5], 0x28(%[block]) \n\t" + "ldc1 %[ftmp6], 0x30(%[block]) \n\t" + "ldc1 %[ftmp7], 0x38(%[block]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + "gssdxc1 %[ftmp4], 0x00(%[addr0], %[line_size]) \n\t" + "gssdxc1 %[ftmp6], 0x00(%[pixels], %[line_sizex3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [addr0]"=&r"(addr[0]), + [pixels]"+&r"(pixels) + : [line_size]"r"((mips_reg)line_size), + [line_sizex3]"r"((mips_reg)(line_size*3)), + [block]"r"(block) + : "memory" + ); + + pixels += line_size*4; + block += 32; + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[block]) \n\t" + "ldc1 %[ftmp1], 0x08(%[block]) \n\t" + "ldc1 %[ftmp2], 0x10(%[block]) \n\t" + "ldc1 %[ftmp3], 0x18(%[block]) \n\t" + "ldc1 %[ftmp4], 0x20(%[block]) \n\t" + "ldc1 %[ftmp5], 0x28(%[block]) \n\t" + "ldc1 %[ftmp6], 0x30(%[block]) \n\t" + "ldc1 %[ftmp7], 0x38(%[block]) \n\t" + PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + "sdc1 %[ftmp0], 0x00(%[pixels]) \n\t" + "sdc1 %[ftmp2], 0x00(%[addr0]) \n\t" + "gssdxc1 %[ftmp4], 0x00(%[addr0], %[line_size]) \n\t" + "gssdxc1 %[ftmp6], 0x00(%[pixels], %[line_sizex3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [addr0]"=&r"(addr[0]), + [pixels]"+&r"(pixels) + : [line_size]"r"((mips_reg)line_size), + [line_sizex3]"r"((mips_reg)(line_size*3)), + [block]"r"(block) + : "memory" + ); +} + +void ff_put_signed_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size) +{ + int64_t line_skip = line_size; + int64_t line_skip3 = 0; + double ftmp[5]; + mips_reg addr[1]; + + __asm__ volatile ( + PTR_ADDU "%[line_skip3], %[line_skip], %[line_skip] \n\t" + "ldc1 %[ftmp1], 0x00(%[block]) \n\t" + "ldc1 %[ftmp0], 0x08(%[block]) \n\t" + "packsshb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "ldc1 %[ftmp2], 0x10(%[block]) \n\t" + "ldc1 %[ftmp0], 0x18(%[block]) \n\t" + "packsshb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "ldc1 %[ftmp3], 0x20(%[block]) \n\t" + "ldc1 %[ftmp0], 0x28(%[block]) \n\t" + "packsshb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "ldc1 %[ftmp4], 48(%[block]) \n\t" + "ldc1 %[ftmp0], 56(%[block]) \n\t" + "packsshb %[ftmp4], 
%[ftmp4], %[ftmp0] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ff_pb_80] \n\t" + "paddb %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "paddb %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "paddb %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "sdc1 %[ftmp1], 0x00(%[pixels]) \n\t" + "gssdxc1 %[ftmp2], 0x00(%[pixels], %[line_skip]) \n\t" + "gssdxc1 %[ftmp3], 0x00(%[pixels], %[line_skip3]) \n\t" + PTR_ADDU "%[line_skip3], %[line_skip3], %[line_skip] \n\t" + "gssdxc1 %[ftmp4], 0x00(%[pixels], %[line_skip3]) \n\t" + PTR_ADDU "%[addr0], %[line_skip3], %[line_skip] \n\t" + PTR_ADDU "%[pixels], %[pixels], %[addr0] \n\t" + "ldc1 %[ftmp1], 0x40(%[block]) \n\t" + "ldc1 %[ftmp0], 0x48(%[block]) \n\t" + "packsshb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "ldc1 %[ftmp2], 0x50(%[block]) \n\t" + "ldc1 %[ftmp0], 0x58(%[block]) \n\t" + "packsshb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "ldc1 %[ftmp3], 0x60(%[block]) \n\t" + "ldc1 %[ftmp0], 0x68(%[block]) \n\t" + "packsshb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "ldc1 %[ftmp4], 0x70(%[block]) \n\t" + "ldc1 %[ftmp0], 0x78(%[block]) \n\t" + "packsshb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ff_pb_80] \n\t" + "paddb %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "paddb %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "paddb %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "sdc1 %[ftmp1], 0x00(%[pixels]) \n\t" + "gssdxc1 %[ftmp2], 0x00(%[pixels], %[line_skip]) \n\t" + PTR_ADDU "%[addr0], %[line_skip], %[line_skip] \n\t" + "gssdxc1 %[ftmp3], 0x00(%[pixels], %[addr0]) \n\t" + "gssdxc1 %[ftmp4], 0x00(%[pixels], %[line_skip3]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [addr0]"=&r"(addr[0]), + [pixels]"+&r"(pixels), [line_skip3]"+&r"(line_skip3) + : [block]"r"(block), + [line_skip]"r"((mips_reg)line_skip), + [ff_pb_80]"f"(ff_pb_80) + : "memory" + ); +} + +void ff_add_pixels_clamped_mmi(const int16_t *block, + uint8_t *av_restrict pixels, ptrdiff_t line_size) +{ + double ftmp[8]; + uint64_t tmp[1]; + + __asm__ volatile ( + "li %[tmp0], 0x04 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + "ldc1 %[ftmp1], 0x00(%[block]) \n\t" + "ldc1 %[ftmp2], 0x08(%[block]) \n\t" + "ldc1 %[ftmp3], 0x10(%[block]) \n\t" + "ldc1 %[ftmp4], 0x18(%[block]) \n\t" + "ldc1 %[ftmp5], 0x00(%[pixels]) \n\t" + "gsldxc1 %[ftmp6], 0x00(%[pixels], %[line_size]) \n\t" + "mov.d %[ftmp7], %[ftmp5] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "mov.d %[ftmp7], %[ftmp6] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "sdc1 %[ftmp1], 0x00(%[pixels]) \n\t" + "gssdxc1 %[ftmp3], 0x00(%[pixels], %[line_size]) \n\t" + "addi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDIU "%[block], %[block], 0x20 \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + "bnez %[tmp0], 1b" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [tmp0]"=&r"(tmp[0]), + [pixels]"+&r"(pixels), [block]"+&r"(block) + : [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} diff --git 
a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c new file mode 100644 index 0000000000..b29e420556 --- /dev/null +++ b/libavcodec/mips/idctdsp_msa.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "idctdsp_mips.h" + +static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, + int32_t stride) +{ + uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + CLIP_SH4_0_255(in0, in1, in2, in3); + CLIP_SH4_0_255(in4, in5, in6, in7); + PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); + PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); + + in0_d = __msa_copy_u_d((v2i64) in0, 0); + in1_d = __msa_copy_u_d((v2i64) in1, 0); + in2_d = __msa_copy_u_d((v2i64) in2, 0); + in3_d = __msa_copy_u_d((v2i64) in3, 0); + in4_d = __msa_copy_u_d((v2i64) in4, 0); + in5_d = __msa_copy_u_d((v2i64) in5, 0); + in6_d = __msa_copy_u_d((v2i64) in6, 0); + in7_d = __msa_copy_u_d((v2i64) in7, 0); + SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); + pixels += 4 * stride; + SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); +} + +static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, + int32_t stride) +{ + uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + in0 += 128; + in1 += 128; + in2 += 128; + in3 += 128; + in4 += 128; + in5 += 128; + in6 += 128; + in7 += 128; + + CLIP_SH4_0_255(in0, in1, in2, in3); + CLIP_SH4_0_255(in4, in5, in6, in7); + PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); + PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); + + in0_d = __msa_copy_u_d((v2i64) in0, 0); + in1_d = __msa_copy_u_d((v2i64) in1, 0); + in2_d = __msa_copy_u_d((v2i64) in2, 0); + in3_d = __msa_copy_u_d((v2i64) in3, 0); + in4_d = __msa_copy_u_d((v2i64) in4, 0); + in5_d = __msa_copy_u_d((v2i64) in5, 0); + in6_d = __msa_copy_u_d((v2i64) in6, 0); + in7_d = __msa_copy_u_d((v2i64) in7, 0); + SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); + pixels += 4 * stride; + SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); +} + +static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, + int32_t stride) +{ + uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 pix_in0, pix_in1, pix_in2, pix_in3; + v16u8 pix_in4, pix_in5, pix_in6, pix_in7; + v8u16 pix0, pix1, pix2, pix3, pix4, pix5, pix6, pix7; + v8i16 zero = { 0 }; 
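+    /* Scalar reference of the operation vectorized below: each residual in
+     * 'block' is added to the co-located pixel and the sum is clamped to the
+     * unsigned 8-bit range before being stored back, i.e. roughly:
+     *
+     *     for (i = 0; i < 8; i++)
+     *         for (j = 0; j < 8; j++)
+     *             pixels[i * stride + j] =
+     *                 av_clip_uint8(pixels[i * stride + j] + block[i * 8 + j]);
+     *
+     * The MSA version below performs the same work on whole rows at a time.
+     */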
+ + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(pixels, stride, pix_in0, pix_in1, pix_in2, + pix_in3, pix_in4, pix_in5, pix_in6, pix_in7); + + ILVR_B4_UH(zero, pix_in0, zero, pix_in1, zero, pix_in2, zero, pix_in3, + pix0, pix1, pix2, pix3); + ILVR_B4_UH(zero, pix_in4, zero, pix_in5, zero, pix_in6, zero, pix_in7, + pix4, pix5, pix6, pix7); + + in0 += (v8i16) pix0; + in1 += (v8i16) pix1; + in2 += (v8i16) pix2; + in3 += (v8i16) pix3; + in4 += (v8i16) pix4; + in5 += (v8i16) pix5; + in6 += (v8i16) pix6; + in7 += (v8i16) pix7; + + CLIP_SH4_0_255(in0, in1, in2, in3); + CLIP_SH4_0_255(in4, in5, in6, in7); + PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); + PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); + + in0_d = __msa_copy_u_d((v2i64) in0, 0); + in1_d = __msa_copy_u_d((v2i64) in1, 0); + in2_d = __msa_copy_u_d((v2i64) in2, 0); + in3_d = __msa_copy_u_d((v2i64) in3, 0); + in4_d = __msa_copy_u_d((v2i64) in4, 0); + in5_d = __msa_copy_u_d((v2i64) in5, 0); + in6_d = __msa_copy_u_d((v2i64) in6, 0); + in7_d = __msa_copy_u_d((v2i64) in7, 0); + SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); + pixels += 4 * stride; + SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); +} + +void ff_put_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size) +{ + put_pixels_clamped_msa(block, pixels, line_size); +} + +void ff_put_signed_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size) +{ + put_signed_pixels_clamped_msa(block, pixels, line_size); +} + +void ff_add_pixels_clamped_msa(const int16_t *block, + uint8_t *av_restrict pixels, + ptrdiff_t line_size) +{ + add_pixels_clamped_msa(block, pixels, line_size); +} diff --git a/libavcodec/mips/iirfilter_mips.c b/libavcodec/mips/iirfilter_mips.c new file mode 100644 index 0000000000..87db9ffe55 --- /dev/null +++ b/libavcodec/mips/iirfilter_mips.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * IIR filter optimized for MIPS floating-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + /** + * @file + * Reference: libavcodec/iirfilter.c + */ + +#include "libavcodec/iirfilter.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +typedef struct FFIIRFilterCoeffs { + int order; + float gain; + int *cx; + float *cy; +} FFIIRFilterCoeffs; + +typedef struct FFIIRFilterState { + float x[1]; +} FFIIRFilterState; + +static void ff_iir_filter_flt_mips(const struct FFIIRFilterCoeffs *c, + struct FFIIRFilterState *s, int size, + const float *src, int sstep, float *dst, int dstep) +{ + if (c->order == 2) { + int i; + const float *src0 = src; + float *dst0 = dst; + for (i = 0; i < size; i++) { + float in = *src0 * c->gain + s->x[0] * c->cy[0] + s->x[1] * c->cy[1]; + *dst0 = s->x[0] + in + s->x[1] * c->cx[1]; + s->x[0] = s->x[1]; + s->x[1] = in; + src0 += sstep; + dst0 += dstep; + } + } else if (c->order == 4) { + int i; + const float *src0 = src; + float *dst0 = dst; + float four = 4.0; + float six = 6.0; + for (i = 0; i < size; i += 4) { + float in1, in2, in3, in4; + float res1, res2, res3, res4; + float *x = s->x; + float *cy = c->cy; + float gain = c->gain; + float src0_0 = src0[0 ]; + float src0_1 = src0[sstep ]; + float src0_2 = src0[2*sstep]; + float src0_3 = src0[3*sstep]; + + __asm__ volatile ( + "lwc1 $f0, 0(%[cy]) \n\t" + "lwc1 $f4, 0(%[x]) \n\t" + "lwc1 $f5, 4(%[x]) \n\t" + "lwc1 $f6, 8(%[x]) \n\t" + "lwc1 $f7, 12(%[x]) \n\t" + "mul.s %[in1], %[src0_0], %[gain] \n\t" + "mul.s %[in2], %[src0_1], %[gain] \n\t" + "mul.s %[in3], %[src0_2], %[gain] \n\t" + "mul.s %[in4], %[src0_3], %[gain] \n\t" + "lwc1 $f1, 4(%[cy]) \n\t" + "madd.s %[in1], %[in1], $f0, $f4 \n\t" + "madd.s %[in2], %[in2], $f0, $f5 \n\t" + "madd.s %[in3], %[in3], $f0, $f6 \n\t" + "madd.s %[in4], %[in4], $f0, $f7 \n\t" + "lwc1 $f2, 8(%[cy]) \n\t" + "madd.s %[in1], %[in1], $f1, $f5 \n\t" + "madd.s %[in2], %[in2], $f1, $f6 \n\t" + "madd.s %[in3], %[in3], $f1, $f7 \n\t" + "lwc1 $f3, 12(%[cy]) \n\t" + "add.s $f8, $f5, $f7 \n\t" + "madd.s %[in1], %[in1], $f2, $f6 \n\t" + "madd.s %[in2], %[in2], $f2, $f7 \n\t" + "mul.s $f9, $f6, %[six] \n\t" + "mul.s $f10, $f7, %[six] \n\t" + "madd.s %[in1], %[in1], $f3, $f7 \n\t" + "madd.s %[in2], %[in2], $f3, %[in1] \n\t" + "madd.s %[in3], %[in3], $f2, %[in1] \n\t" + "madd.s %[in4], %[in4], $f1, %[in1] \n\t" + "add.s %[res1], $f4, %[in1] \n\t" + "swc1 %[in1], 0(%[x]) \n\t" + "add.s $f0, $f6, %[in1] \n\t" + "madd.s %[in3], %[in3], $f3, %[in2] \n\t" + "madd.s %[in4], %[in4], $f2, %[in2] \n\t" + "add.s %[res2], $f5, %[in2] \n\t" + "madd.s %[res1], %[res1], $f8, %[four] \n\t" + "add.s $f8, $f7, %[in2] \n\t" + "swc1 %[in2], 4(%[x]) \n\t" + "madd.s %[in4], %[in4], $f3, %[in3] \n\t" + "add.s %[res3], 
$f6, %[in3] \n\t" + "add.s %[res1], %[res1], $f9 \n\t" + "madd.s %[res2], %[res2], $f0, %[four] \n\t" + "swc1 %[in3], 8(%[x]) \n\t" + "add.s %[res4], $f7, %[in4] \n\t" + "madd.s %[res3], %[res3], $f8, %[four] \n\t" + "swc1 %[in4], 12(%[x]) \n\t" + "add.s %[res2], %[res2], $f10 \n\t" + "add.s $f8, %[in1], %[in3] \n\t" + "madd.s %[res3], %[res3], %[in1], %[six] \n\t" + "madd.s %[res4], %[res4], $f8, %[four] \n\t" + "madd.s %[res4], %[res4], %[in2], %[six] \n\t" + + : [in1]"=&f"(in1), [in2]"=&f"(in2), + [in3]"=&f"(in3), [in4]"=&f"(in4), + [res1]"=&f"(res1), [res2]"=&f"(res2), + [res3]"=&f"(res3), [res4]"=&f"(res4) + : [src0_0]"f"(src0_0), [src0_1]"f"(src0_1), + [src0_2]"f"(src0_2), [src0_3]"f"(src0_3), + [gain]"f"(gain), [x]"r"(x), [cy]"r"(cy), + [four]"f"(four), [six]"f"(six) + : "$f0", "$f1", "$f2", "$f3", + "$f4", "$f5", "$f6", "$f7", + "$f8", "$f9", "$f10", + "memory" + ); + + dst0[0 ] = res1; + dst0[sstep ] = res2; + dst0[2*sstep] = res3; + dst0[3*sstep] = res4; + + src0 += 4*sstep; + dst0 += 4*dstep; + } + } else { + int i; + const float *src0 = src; + float *dst0 = dst; + for (i = 0; i < size; i++) { + int j; + float in, res; + in = *src0 * c->gain; + for(j = 0; j < c->order; j++) + in += c->cy[j] * s->x[j]; + res = s->x[0] + in + s->x[c->order >> 1] * c->cx[c->order >> 1]; + for(j = 1; j < c->order >> 1; j++) + res += (s->x[j] + s->x[c->order - j]) * c->cx[j]; + for(j = 0; j < c->order - 1; j++) + s->x[j] = s->x[j + 1]; + *dst0 = res; + s->x[c->order - 1] = in; + src0 += sstep; + dst0 += dstep; + } + } +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_iir_filter_init_mips(FFIIRFilterContext *f) { +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + f->filter_flt = ff_iir_filter_flt_mips; +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h new file mode 100644 index 0000000000..6219c5aa40 --- /dev/null +++ b/libavcodec/mips/lsp_mips.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. 
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nedeljko Babic (nbabic@mips.com) + * + * LSP routines for ACELP-based codecs optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/lsp.c + */ +#ifndef AVCODEC_MIPS_LSP_MIPS_H +#define AVCODEC_MIPS_LSP_MIPS_H + +#if HAVE_MIPSFPU && HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +#include "libavutil/mips/asmdefs.h" + +static av_always_inline void ff_lsp2polyf_mips(const double *lsp, double *f, int lp_half_order) +{ + int i, j = 0; + double * p_fi = f; + double * p_f = 0; + + f[0] = 1.0; + f[1] = -2 * lsp[0]; + lsp -= 2; + + for(i=2; i<=lp_half_order; i++) + { + double tmp, f_j_2, f_j_1, f_j; + double val = lsp[2*i]; + + __asm__ volatile( + "move %[p_f], %[p_fi] \n\t" + "add.d %[val], %[val], %[val] \n\t" + PTR_ADDIU "%[p_fi], 8 \n\t" + "ldc1 %[f_j_1], 0(%[p_f]) \n\t" + "ldc1 %[f_j], 8(%[p_f]) \n\t" + "neg.d %[val], %[val] \n\t" + "add.d %[tmp], %[f_j_1], %[f_j_1] \n\t" + "madd.d %[tmp], %[tmp], %[f_j], %[val] \n\t" + "addiu %[j], %[i], -2 \n\t" + "ldc1 %[f_j_2], -8(%[p_f]) \n\t" + "sdc1 %[tmp], 16(%[p_f]) \n\t" + "beqz %[j], ff_lsp2polyf_lp_j_end%= \n\t" + "ff_lsp2polyf_lp_j%=: \n\t" + "add.d %[tmp], %[f_j], %[f_j_2] \n\t" + "madd.d %[tmp], %[tmp], %[f_j_1], %[val] \n\t" + "mov.d %[f_j], %[f_j_1] \n\t" + "addiu %[j], -1 \n\t" + "mov.d %[f_j_1], %[f_j_2] \n\t" + "ldc1 %[f_j_2], -16(%[p_f]) \n\t" + "sdc1 %[tmp], 8(%[p_f]) \n\t" + PTR_ADDIU "%[p_f], -8 \n\t" + "bgtz %[j], ff_lsp2polyf_lp_j%= \n\t" + "ff_lsp2polyf_lp_j_end%=: \n\t" + + : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val), + [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f), + [j]"+r"(j), [p_fi]"+r"(p_fi) + : [i]"r"(i) + : "memory" + ); + f[1] += val; + } +} +#define ff_lsp2polyf ff_lsp2polyf_mips +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU && HAVE_INLINE_ASM */ +#endif /* AVCODEC_MIPS_LSP_MIPS_H */ diff --git a/libavcodec/mips/mathops.h b/libavcodec/mips/mathops.h index 573d325bf1..bb9dc8375a 100644 --- a/libavcodec/mips/mathops.h +++ b/libavcodec/mips/mathops.h @@ -1,20 +1,21 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,58 +28,39 @@ #if HAVE_INLINE_ASM -#if HAVE_LOONGSON -#if ARCH_MIPS64 +#if HAVE_LOONGSON3 -static inline av_const int64_t MAC64(int64_t d, int a, int b) +#define MULH MULH +static inline av_const int MULH(int a, int b) { - int64_t m; - __asm__ ("dmult %2, %3 \n\t" - "mflo %1 \n\t" - "daddu %0, %0, %1 \n\t" - : "+r"(d), "=&r"(m) : "r"(a), "r"(b) + int c; + __asm__ ("dmult %1, %2 \n\t" + "mflo %0 \n\t" + "dsrl %0, %0, 32 \n\t" + : "=r"(c) + : "r"(a),"r"(b) : "hi", "lo"); - return d; + return c; } -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) -static inline av_const int64_t MLS64(int64_t d, int a, int b) +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) { - int64_t m; - __asm__ ("dmult %2, %3 \n\t" - "mflo %1 \n\t" - "dsubu %0, %0, %1 \n\t" - : "+r"(d), "=&r"(m) : "r"(a), "r"(b) - : "hi", "lo"); - return d; -} -#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) - -#else - -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - int64_t m; - __asm__ ("dmult.g %1, %2, %3 \n\t" - "daddu %0, %0, %1 \n\t" - : "+r"(d), "=&r"(m) : "r"(a), "r"(b)); - return d; + int t = b; + __asm__ ("sgt $8, %1, %2 \n\t" + "movn %0, %1, $8 \n\t" + "movn %1, %2, $8 \n\t" + "sgt $8, %1, %3 \n\t" + "movz %1, %3, $8 \n\t" + "sgt $8, %0, %1 \n\t" + "movn %0, %1, $8 \n\t" + : "+&r"(t),"+&r"(a) + : "r"(b),"r"(c) + : "$8"); + return t; } -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) - -static inline av_const int64_t MLS64(int64_t d, int a, int b) -{ - int64_t m; - __asm__ ("dmult.g %1, %2, %3 \n\t" - "dsubu %0, %0, %1 \n\t" - : "+r"(d), "=&r"(m) : "r"(a), "r"(b)); - return d; -} -#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) - -#endif -#endif /* HAVE_LOONGSON */ +#endif /* HAVE_LOONGSON3 */ #endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/mips/me_cmp_init_mips.c b/libavcodec/mips/me_cmp_init_mips.c new file mode 100644 index 0000000000..219a0dc00c --- /dev/null +++ b/libavcodec/mips/me_cmp_init_mips.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "me_cmp_mips.h" + +#if HAVE_MSA +static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx) +{ +#if BIT_DEPTH == 8 + c->pix_abs[0][0] = ff_pix_abs16_msa; + c->pix_abs[0][1] = ff_pix_abs16_x2_msa; + c->pix_abs[0][2] = ff_pix_abs16_y2_msa; + c->pix_abs[0][3] = ff_pix_abs16_xy2_msa; + c->pix_abs[1][0] = ff_pix_abs8_msa; + c->pix_abs[1][1] = ff_pix_abs8_x2_msa; + c->pix_abs[1][2] = ff_pix_abs8_y2_msa; + c->pix_abs[1][3] = ff_pix_abs8_xy2_msa; + + c->hadamard8_diff[0] = ff_hadamard8_diff16_msa; + c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa; + + c->hadamard8_diff[4] = ff_hadamard8_intra16_msa; + c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa; + + c->sad[0] = ff_pix_abs16_msa; + c->sad[1] = ff_pix_abs8_msa; + c->sse[0] = ff_sse16_msa; + c->sse[1] = ff_sse8_msa; + c->sse[2] = ff_sse4_msa; +#endif +} +#endif // #if HAVE_MSA + +av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx) +{ +#if HAVE_MSA + me_cmp_msa(c, avctx); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/me_cmp_mips.h b/libavcodec/mips/me_cmp_mips.h new file mode 100644 index 0000000000..e0d0f51af8 --- /dev/null +++ b/libavcodec/mips/me_cmp_mips.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H +#define AVCODEC_MIPS_ME_CMP_MIPS_H + +#include "../mpegvideo.h" +#include "libavcodec/bit_depth_template.c" + +int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h); +int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h); +int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h); +int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h); +int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref, + ptrdiff_t stride, int i32Height); +int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref, + ptrdiff_t stride, int i32Height); +int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref, + ptrdiff_t stride, int i32Height); +void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block, + ptrdiff_t stride); + +#endif // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H diff --git a/libavcodec/mips/me_cmp_msa.c b/libavcodec/mips/me_cmp_msa.c new file mode 100644 index 0000000000..0e3165cd8f --- /dev/null +++ b/libavcodec/mips/me_cmp_msa.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "me_cmp_mips.h" + +static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride, + uint8_t *ref, int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride, + uint8_t *ref, int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, comp0, comp1; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, comp0, comp1; + v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30); + LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31); + ref += (4 * ref_stride); + + AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, 
comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30); + LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31); + ref += (4 * ref_stride); + + AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, comp0, comp1; + v16u8 ref0, ref1, ref2, ref3, ref4; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4); + ref += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1); + PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3); + AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4); + ref += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1); + PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3); + AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, comp0, comp1; + v16u8 ref0, ref1, ref2, ref3, ref4; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); + ref += (5 * ref_stride); + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + ref4 = ref3; + + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (3 * ref_stride); + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, temp0, temp1, diff; + v16u8 ref0, ref1, ref2, ref3, ref4; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 comp0, comp1, comp2, comp3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, 
src1); + + VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + comp0 += comp1; + comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2); + comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0); + + temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1); + comp2 = __msa_hadd_u_h(temp0, temp0); + comp1 += comp2; + comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2); + comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1); + comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0); + diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1); + sad += __msa_hadd_u_h(diff, diff); + + temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2); + comp3 = __msa_hadd_u_h(temp1, temp1); + comp2 += comp3; + comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2); + comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2); + + temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp3 += comp0; + comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2); + comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3); + comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2); + diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3); + sad += __msa_hadd_u_h(diff, diff); + } + + return (HADD_UH_U32(sad)); +} + +static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src, + int32_t src_stride, + uint8_t *ref, + int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, comp, diff; + v16u8 temp0, temp1, temp2, temp3; + v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14; + v8u16 comp0, comp1, comp2, comp3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03); + LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13); + ref += (5 * ref_stride); + + ILVRL_B2_UB(ref14, ref04, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + ILVRL_B2_UB(ref10, ref00, temp2, temp3); + comp2 = __msa_hadd_u_h(temp2, temp2); + comp3 = __msa_hadd_u_h(temp3, temp3); + comp0 += comp2; + comp1 += comp3; + SRARI_H2_UH(comp0, comp1, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0); + diff = __msa_asub_u_b(src0, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref11, ref01, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + comp2 += comp0; + comp3 += comp1; + SRARI_H2_UH(comp2, comp3, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2); + diff = __msa_asub_u_b(src1, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref12, ref02, temp2, temp3); + comp2 = __msa_hadd_u_h(temp2, temp2); + comp3 = __msa_hadd_u_h(temp3, temp3); + comp0 += comp2; + comp1 += comp3; + SRARI_H2_UH(comp0, comp1, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0); + diff = __msa_asub_u_b(src2, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref13, ref03, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + comp2 += comp0; + comp3 += comp1; + SRARI_H2_UH(comp2, comp3, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2); + diff = __msa_asub_u_b(src3, comp); + sad += __msa_hadd_u_h(diff, diff); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, 
ref_stride, ref00, ref01, ref02, ref03); + LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13); + ref += (3 * ref_stride); + + ILVRL_B2_UB(ref10, ref00, temp2, temp3); + comp2 = __msa_hadd_u_h(temp2, temp2); + comp3 = __msa_hadd_u_h(temp3, temp3); + comp0 += comp2; + comp1 += comp3; + SRARI_H2_UH(comp0, comp1, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0); + diff = __msa_asub_u_b(src0, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref11, ref01, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + comp2 += comp0; + comp3 += comp1; + SRARI_H2_UH(comp2, comp3, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2); + diff = __msa_asub_u_b(src1, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref12, ref02, temp2, temp3); + comp2 = __msa_hadd_u_h(temp2, temp2); + comp3 = __msa_hadd_u_h(temp3, temp3); + comp0 += comp2; + comp1 += comp3; + SRARI_H2_UH(comp0, comp1, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0); + diff = __msa_asub_u_b(src2, comp); + sad += __msa_hadd_u_h(diff, diff); + + ILVRL_B2_UB(ref13, ref03, temp0, temp1); + comp0 = __msa_hadd_u_h(temp0, temp0); + comp1 = __msa_hadd_u_h(temp1, temp1); + comp2 += comp0; + comp3 += comp1; + SRARI_H2_UH(comp2, comp3, 2); + comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2); + diff = __msa_asub_u_b(src3, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return (HADD_UH_U32(sad)); +} + +#define CALC_MSE_B(src, ref, var) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ +} + +static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + uint32_t sse; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + sse = HADD_SW_S32(var); + + return sse; +} + +static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + uint32_t sse; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + sse = HADD_SW_S32(var); + + return sse; +} + +static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) +{ + int32_t ht_cnt; + uint32_t sse; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + 
ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + sse = HADD_SW_S32(var); + + return sse; +} + +static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *ref, int32_t ref_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 sum = { 0 }; + v8i16 zero = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3, + src4, ref4, src5, ref5, src6, ref6, src7, ref7, + diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7); + HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3); + HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7); + TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7); + BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1, + temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1); + BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2, + diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2); + BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4, + temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4); + TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, + temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); + BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1, + diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1); + BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2, + temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2); + ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7, + diff0, diff1, diff2, diff3); + sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7); + sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6); + sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5); + sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4); + sum += __msa_add_a_h((v8i16) diff0, zero); + sum += __msa_add_a_h((v8i16) diff1, zero); + sum += __msa_add_a_h((v8i16) diff2, zero); + sum += __msa_add_a_h((v8i16) diff3, zero); + + return (HADD_UH_U32(sum)); +} + +static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *ref, int32_t ref_stride) +{ + int32_t sum_res = 0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 sum = { 0 }; + v16i8 zero = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7, + src0, src1, src2, src3, src4, src5, src6, src7); + ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3, + zero, src4, zero, src5, zero, src6, zero, src7, + diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7); + BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1, + temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1); + 
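+    /* The remaining stages mirror hadamard_diff_8x8_msa: the last butterfly
+     * passes finish the row transform, the 8x8 result is transposed, the
+     * column transform is applied and the absolute values are accumulated;
+     * the intra variant finally subtracts the DC term (temp0[0] + temp4[0]). */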
BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2, + diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2); + BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4, + temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4); + TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, + temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); + BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1, + diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1); + BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2, + temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2); + ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7, + diff0, diff1, diff2, diff3); + sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7); + sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6); + sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5); + sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4); + sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero); + sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero); + sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero); + sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero); + sum_res = (HADD_UH_U32(sum)); + sum_res -= abs(temp0[0] + temp4[0]); + + return sum_res; +} + +int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, + ptrdiff_t stride, int height) +{ + return sad_16width_msa(src, stride, ref, stride, height); +} + +int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, + ptrdiff_t stride, int height) +{ + return sad_8width_msa(src, stride, ref, stride, height); +} + +int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); +} + +int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); +} + +int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); +} + +int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); +} + +int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); +} + +int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h) +{ + return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); +} + +int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, + ptrdiff_t stride, int height) +{ + return sse_16width_msa(src, stride, ref, stride, height); +} + +int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, + ptrdiff_t stride, int height) +{ + return sse_8width_msa(src, stride, ref, stride, height); +} + +int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, + ptrdiff_t stride, int height) +{ + return sse_4width_msa(src, stride, ref, stride, height); +} + +int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h) +{ + return hadamard_diff_8x8_msa(src, stride, dst, stride); +} + +int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int h) +{ + return hadamard_intra_8x8_msa(src, stride, dst, stride); +} 
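+
+/* The 16x16 cost functions are assembled from the 8x8 kernels above:
+ * WRAPPER8_16_SQ sums the scores of the two 8x8 halves of the top row and,
+ * when h == 16, of the bottom row as well, so ff_hadamard8_diff16_msa and
+ * ff_hadamard8_intra16_msa reuse ff_hadamard8_diff8x8_msa and
+ * ff_hadamard8_intra8x8_msa without any additional MSA code. */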
+ +/* Hadamard Transform functions */ +#define WRAPPER8_16_SQ(name8, name16) \ +int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ + ptrdiff_t stride, int h) \ +{ \ + int score = 0; \ + score += name8(s, dst, src, stride, 8); \ + score += name8(s, dst + 8, src + 8, stride, 8); \ + if(h == 16) { \ + dst += 8 * stride; \ + src += 8 * stride; \ + score +=name8(s, dst, src, stride, 8); \ + score +=name8(s, dst + 8, src + 8, stride, 8); \ + } \ + return score; \ +} + +WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa); +WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa); diff --git a/libavcodec/mips/mpegaudiodsp_mips_fixed.c b/libavcodec/mips/mpegaudiodsp_mips_fixed.c new file mode 100644 index 0000000000..ed8c89089e --- /dev/null +++ b/libavcodec/mips/mpegaudiodsp_mips_fixed.c @@ -0,0 +1,918 @@ + /* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * MPEG Audio decoder optimized for MIPS fixed-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/mpegaudiodsp_template.c + */ + +#include <string.h> + +#include "libavutil/mips/asmdefs.h" +#include "libavcodec/mpegaudiodsp.h" + +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + +static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window, + int *dither_state, int16_t *samples, int incr) +{ + register const int32_t *w, *w2, *p; + int j; + int16_t *samples2; + int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2; + int w2_asm, w2_asm1, *p_temp1, *p_temp2; + int sum1 = 0; + int const min_asm = -32768, max_asm = 32767; + int temp1, temp2 = 0, temp3 = 0; + int64_t sum; + + /* copy to avoid wrap */ + memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf)); + samples2 = samples + 31 * incr; + w = window; + w2 = window + 31; + sum = *dither_state; + p = synth_buf + 16; + p_temp1 = synth_buf + 16; + p_temp2 = synth_buf + 48; + temp1 = sum; + + /** + * use of round_sample function from the original code is eliminated, + * changed with appropriate assembly instructions. + */ + __asm__ volatile ( + "mthi $zero \n\t" + "mtlo %[temp1] \n\t" + "lw %[w_asm], 0(%[w]) \n\t" + "lw %[p_asm], 0(%[p]) \n\t" + "lw %[w_asm1], 64*4(%[w]) \n\t" + "lw %[p_asm1], 64*4(%[p]) \n\t" + "lw %[w_asm2], 128*4(%[w]) \n\t" + "lw %[p_asm2], 128*4(%[p]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "madd %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 192*4(%[w]) \n\t" + "lw %[p_asm], 192*4(%[p]) \n\t" + "lw %[w_asm1], 256*4(%[w]) \n\t" + "lw %[p_asm1], 256*4(%[p]) \n\t" + "lw %[w_asm2], 320*4(%[w]) \n\t" + "lw %[p_asm2], 320*4(%[p]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "madd %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 384*4(%[w]) \n\t" + "lw %[p_asm], 384*4(%[p]) \n\t" + "lw %[w_asm1], 448*4(%[w]) \n\t" + "lw %[p_asm1], 448*4(%[p]) \n\t" + "lw %[w_asm2], 32*4(%[w]) \n\t" + "lw %[p_asm2], 32*4(%[p]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "msub %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 96*4(%[w]) \n\t" + "lw %[p_asm], 96*4(%[p]) \n\t" + "lw %[w_asm1], 160*4(%[w]) \n\t" + "lw %[p_asm1], 160*4(%[p]) \n\t" + "lw %[w_asm2], 224*4(%[w]) \n\t" + "lw %[p_asm2], 224*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 288*4(%[w]) \n\t" + "lw %[p_asm], 288*4(%[p]) \n\t" + "lw %[w_asm1], 352*4(%[w]) \n\t" + "lw %[p_asm1], 352*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "lw %[w_asm], 480*4(%[w]) \n\t" + "lw %[p_asm], 480*4(%[p]) \n\t" + "lw %[w_asm2], 416*4(%[w]) \n\t" + "lw %[p_asm2], 416*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub %[w_asm2], %[p_asm2] \n\t" + + /*round_sample function from the original code is eliminated, + * changed with appropriate assembly instructions + * code example: + + "extr.w %[sum1],$ac0,24 \n\t" + "mflo %[temp3], $ac0 \n\t" + "and %[temp1], %[temp3], 0x00ffffff \n\t" + "slt %[temp2], %[sum1], %[min_asm] \n\t" + "movn %[sum1], %[min_asm],%[temp2] \n\t" + "slt %[temp2], %[max_asm],%[sum1] \n\t" + "movn %[sum1], %[max_asm],%[temp2] \n\t" + "sh %[sum1], 0(%[samples]) \n\t" + */ + + "extr.w %[sum1], $ac0, 24 \n\t" + "mflo %[temp3] \n\t" + PTR_ADDIU "%[w], %[w], 4 \n\t" + "and %[temp1], 
%[temp3], 0x00ffffff \n\t" + "slt %[temp2], %[sum1], %[min_asm] \n\t" + "movn %[sum1], %[min_asm], %[temp2] \n\t" + "slt %[temp2], %[max_asm], %[sum1] \n\t" + "movn %[sum1], %[max_asm], %[temp2] \n\t" + "sh %[sum1], 0(%[samples]) \n\t" + + : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), + [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2), + [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), + [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3) + : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm), + [max_asm] "r" (max_asm) + : "memory", "hi","lo" + ); + + samples += incr; + + /* we calculate two samples at the same time to avoid one memory + access per two sample */ + + for(j = 1; j < 16; j++) { + __asm__ volatile ( + "mthi $0, $ac1 \n\t" + "mtlo $0, $ac1 \n\t" + "mthi $0 \n\t" + "mtlo %[temp1] \n\t" + PTR_ADDIU "%[p_temp1], %[p_temp1], 4 \n\t" + "lw %[w_asm], 0(%[w]) \n\t" + "lw %[p_asm], 0(%[p_temp1]) \n\t" + "lw %[w2_asm], 0(%[w2]) \n\t" + "lw %[w_asm1], 64*4(%[w]) \n\t" + "lw %[p_asm1], 64*4(%[p_temp1]) \n\t" + "lw %[w2_asm1], 64*4(%[w2]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw %[w_asm], 128*4(%[w]) \n\t" + "lw %[p_asm], 128*4(%[p_temp1]) \n\t" + "lw %[w2_asm], 128*4(%[w2]) \n\t" + "lw %[w_asm1], 192*4(%[w]) \n\t" + "lw %[p_asm1], 192*4(%[p_temp1]) \n\t" + "lw %[w2_asm1], 192*4(%[w2]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw %[w_asm], 256*4(%[w]) \n\t" + "lw %[p_asm], 256*4(%[p_temp1]) \n\t" + "lw %[w2_asm], 256*4(%[w2]) \n\t" + "lw %[w_asm1], 320*4(%[w]) \n\t" + "lw %[p_asm1], 320*4(%[p_temp1]) \n\t" + "lw %[w2_asm1], 320*4(%[w2]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw %[w_asm], 384*4(%[w]) \n\t" + "lw %[p_asm], 384*4(%[p_temp1]) \n\t" + "lw %[w2_asm], 384*4(%[w2]) \n\t" + "lw %[w_asm1], 448*4(%[w]) \n\t" + "lw %[p_asm1], 448*4(%[p_temp1]) \n\t" + "lw %[w2_asm1], 448*4(%[w2]) \n\t" + "madd %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "madd %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + PTR_ADDIU "%[p_temp2], %[p_temp2], -4 \n\t" + "lw %[w_asm], 32*4(%[w]) \n\t" + "lw %[p_asm], 0(%[p_temp2]) \n\t" + "lw %[w2_asm], 32*4(%[w2]) \n\t" + "lw %[w_asm1], 96*4(%[w]) \n\t" + "lw %[p_asm1], 64*4(%[p_temp2]) \n\t" + "lw %[w2_asm1], 96*4(%[w2]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw %[w_asm], 160*4(%[w]) \n\t" + "lw %[p_asm], 128*4(%[p_temp2]) \n\t" + "lw %[w2_asm], 160*4(%[w2]) \n\t" + "lw %[w_asm1], 224*4(%[w]) \n\t" + "lw %[p_asm1], 192*4(%[p_temp2]) \n\t" + "lw %[w2_asm1], 224*4(%[w2]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw %[w_asm], 288*4(%[w]) \n\t" + "lw %[p_asm], 256*4(%[p_temp2]) \n\t" + "lw %[w2_asm], 288*4(%[w2]) \n\t" + "lw %[w_asm1], 352*4(%[w]) \n\t" + "lw %[p_asm1], 320*4(%[p_temp2]) \n\t" + "lw %[w2_asm1], 352*4(%[w2]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + "lw 
%[w_asm], 416*4(%[w]) \n\t" + "lw %[p_asm], 384*4(%[p_temp2]) \n\t" + "lw %[w2_asm], 416*4(%[w2]) \n\t" + "lw %[w_asm1], 480*4(%[w]) \n\t" + "lw %[p_asm1], 448*4(%[p_temp2]) \n\t" + "lw %[w2_asm1], 480*4(%[w2]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub $ac1, %[w2_asm], %[p_asm] \n\t" + "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" + PTR_ADDIU "%[w], %[w], 4 \n\t" + PTR_ADDIU "%[w2], %[w2], -4 \n\t" + "mflo %[temp2] \n\t" + "extr.w %[sum1], $ac0, 24 \n\t" + "li %[temp3], 1 \n\t" + "and %[temp1], %[temp2], 0x00ffffff \n\t" + "madd $ac1, %[temp1], %[temp3] \n\t" + "slt %[temp2], %[sum1], %[min_asm] \n\t" + "movn %[sum1], %[min_asm], %[temp2] \n\t" + "slt %[temp2], %[max_asm], %[sum1] \n\t" + "movn %[sum1], %[max_asm], %[temp2] \n\t" + "sh %[sum1], 0(%[samples]) \n\t" + "mflo %[temp3], $ac1 \n\t" + "extr.w %[sum1], $ac1, 24 \n\t" + "and %[temp1], %[temp3], 0x00ffffff \n\t" + "slt %[temp2], %[sum1], %[min_asm] \n\t" + "movn %[sum1], %[min_asm], %[temp2] \n\t" + "slt %[temp2], %[max_asm], %[sum1] \n\t" + "movn %[sum1], %[max_asm], %[temp2] \n\t" + "sh %[sum1], 0(%[samples2]) \n\t" + + : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), + [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1), + [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2), + [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1), + [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples), + [samples2] "+r" (samples2), [temp3] "+r" (temp3) + : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm) + : "memory", "hi", "lo", "$ac1hi", "$ac1lo" + ); + + samples += incr; + samples2 -= incr; + } + + p = synth_buf + 32; + + __asm__ volatile ( + "mthi $0 \n\t" + "mtlo %[temp1] \n\t" + "lw %[w_asm], 32*4(%[w]) \n\t" + "lw %[p_asm], 0(%[p]) \n\t" + "lw %[w_asm1], 96*4(%[w]) \n\t" + "lw %[p_asm1], 64*4(%[p]) \n\t" + "lw %[w_asm2], 160*4(%[w]) \n\t" + "lw %[p_asm2], 128*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 224*4(%[w]) \n\t" + "lw %[p_asm], 192*4(%[p]) \n\t" + "lw %[w_asm1], 288*4(%[w]) \n\t" + "lw %[p_asm1], 256*4(%[p]) \n\t" + "lw %[w_asm2], 352*4(%[w]) \n\t" + "lw %[p_asm2], 320*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "msub %[w_asm2], %[p_asm2] \n\t" + "lw %[w_asm], 416*4(%[w]) \n\t" + "lw %[p_asm], 384*4(%[p]) \n\t" + "lw %[w_asm1], 480*4(%[w]) \n\t" + "lw %[p_asm1], 448*4(%[p]) \n\t" + "msub %[w_asm], %[p_asm] \n\t" + "msub %[w_asm1], %[p_asm1] \n\t" + "extr.w %[sum1], $ac0, 24 \n\t" + "mflo %[temp2] \n\t" + "and %[temp1], %[temp2], 0x00ffffff \n\t" + "slt %[temp2], %[sum1], %[min_asm] \n\t" + "movn %[sum1], %[min_asm], %[temp2] \n\t" + "slt %[temp2], %[max_asm], %[sum1] \n\t" + "movn %[sum1], %[max_asm], %[temp2] \n\t" + "sh %[sum1], 0(%[samples]) \n\t" + + : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), + [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2), + [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1) + : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm), + [max_asm] "r" (max_asm) + : "memory", "hi", "lo", "$ac1hi", "$ac1lo" + ); + + *dither_state= temp1; +} + +static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win) +{ + int j; + int t0, t1, t2, t3, s0, s1, s2, s3; + int tmp[18], *tmp1, *in1; + /* temporary variables */ + int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6; + int t4, t5, t6, t8, 
t7; + + /* values defined in macros and tables are + * eliminated - they are directly loaded in appropriate variables + */ + int const C_1 = 4229717092; /* cos(pi*1/18)*2 */ + int const C_2 = 4035949074; /* cos(pi*2/18)*2 */ + int const C_3 = 575416510; /* -cos(pi*3/18)*2 */ + int const C_3A = 3719550786; /* cos(pi*3/18)*2 */ + int const C_4 = 1004831466; /* -cos(pi*4/18)*2 */ + int const C_5 = 1534215534; /* -cos(pi*5/18)*2 */ + int const C_7 = -1468965330; /* -cos(pi*7/18)*2 */ + int const C_8 = -745813244; /* -cos(pi*8/18)*2 */ + + /* + * instructions of the first two loops are reorganized and loops are unrolled, + * in order to eliminate unnecessary readings and writings in array + */ + + __asm__ volatile ( + "lw %[t1], 17*4(%[in]) \n\t" + "lw %[t2], 16*4(%[in]) \n\t" + "lw %[t3], 15*4(%[in]) \n\t" + "lw %[t4], 14*4(%[in]) \n\t" + "addu %[t1], %[t1], %[t2] \n\t" + "addu %[t2], %[t2], %[t3] \n\t" + "addu %[t3], %[t3], %[t4] \n\t" + "lw %[t5], 13*4(%[in]) \n\t" + "addu %[t1], %[t1], %[t3] \n\t" + "sw %[t2], 16*4(%[in]) \n\t" + "lw %[t6], 12*4(%[in]) \n\t" + "sw %[t1], 17*4(%[in]) \n\t" + "addu %[t4], %[t4], %[t5] \n\t" + "addu %[t5], %[t5], %[t6] \n\t" + "lw %[t7], 11*4(%[in]) \n\t" + "addu %[t3], %[t3], %[t5] \n\t" + "sw %[t4], 14*4(%[in]) \n\t" + "lw %[t8], 10*4(%[in]) \n\t" + "sw %[t3], 15*4(%[in]) \n\t" + "addu %[t6], %[t6], %[t7] \n\t" + "addu %[t7], %[t7], %[t8] \n\t" + "sw %[t6], 12*4(%[in]) \n\t" + "addu %[t5], %[t5], %[t7] \n\t" + "lw %[t1], 9*4(%[in]) \n\t" + "lw %[t2], 8*4(%[in]) \n\t" + "sw %[t5], 13*4(%[in]) \n\t" + "addu %[t8], %[t8], %[t1] \n\t" + "addu %[t1], %[t1], %[t2] \n\t" + "sw %[t8], 10*4(%[in]) \n\t" + "addu %[t7], %[t7], %[t1] \n\t" + "lw %[t3], 7*4(%[in]) \n\t" + "lw %[t4], 6*4(%[in]) \n\t" + "sw %[t7], 11*4(%[in]) \n\t" + "addu %[t2], %[t2], %[t3] \n\t" + "addu %[t3], %[t3], %[t4] \n\t" + "sw %[t2], 8*4(%[in]) \n\t" + "addu %[t1], %[t1], %[t3] \n\t" + "lw %[t5], 5*4(%[in]) \n\t" + "lw %[t6], 4*4(%[in]) \n\t" + "sw %[t1], 9*4(%[in]) \n\t" + "addu %[t4], %[t4], %[t5] \n\t" + "addu %[t5], %[t5], %[t6] \n\t" + "sw %[t4], 6*4(%[in]) \n\t" + "addu %[t3], %[t3], %[t5] \n\t" + "lw %[t7], 3*4(%[in]) \n\t" + "lw %[t8], 2*4(%[in]) \n\t" + "sw %[t3], 7*4(%[in]) \n\t" + "addu %[t6], %[t6], %[t7] \n\t" + "addu %[t7], %[t7], %[t8] \n\t" + "sw %[t6], 4*4(%[in]) \n\t" + "addu %[t5], %[t5], %[t7] \n\t" + "lw %[t1], 1*4(%[in]) \n\t" + "lw %[t2], 0*4(%[in]) \n\t" + "sw %[t5], 5*4(%[in]) \n\t" + "addu %[t8], %[t8], %[t1] \n\t" + "addu %[t1], %[t1], %[t2] \n\t" + "sw %[t8], 2*4(%[in]) \n\t" + "addu %[t7], %[t7], %[t1] \n\t" + "sw %[t7], 3*4(%[in]) \n\t" + "sw %[t1], 1*4(%[in]) \n\t" + + : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), + [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), + [t7] "=&r" (t7), [t8] "=&r" (t8) + : + : "memory" + ); + + for(j = 0; j < 2; j++) { + + tmp1 = tmp + j; + in1 = in + j; + + /** + * Original constants are multiplied by two in advanced + * for assembly optimization (e.g. C_2 = 2 * C2). + * That can lead to overflow in operations where they are used. + * + * Example of the solution: + * + * in original code: + * t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32 + * + * in assembly: + * C_2 = 2 * C2; + * . + * . 
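+     * (note on the listing below: "multu" forms an unsigned 32x32->64 bit
+     * product, since the doubled constant no longer fits in a signed 32-bit
+     * word; when the other, signed operand is negative, the "movn"/"sub"
+     * pair subtracts the constant from the high word again, which recovers
+     * the signed 64-bit product computed by the reference code)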
+ * "lw %[t7], 4*4(%[in1]) \n\t" + * "lw %[t8], 8*4(%[in1]) \n\t" + * "addu %[temp_reg2],%[t7], %[t8] \n\t" + * "multu %[C_2], %[temp_reg2] \n\t" + * "mfhi %[temp_reg1] \n\t" + * "sra %[temp_reg2],%[temp_reg2],31 \n\t" + * "move %[t0], $0 \n\t" + * "movn %[t0], %[C_2], %[temp_reg2] \n\t" + * "sub %[t0], %[temp_reg1],%[t0] \n\t" + */ + + __asm__ volatile ( + "lw %[t7], 4*4(%[in1]) \n\t" + "lw %[t8], 8*4(%[in1]) \n\t" + "lw %[t6], 16*4(%[in1]) \n\t" + "lw %[t4], 0*4(%[in1]) \n\t" + "addu %[temp_reg2], %[t7], %[t8] \n\t" + "addu %[t2], %[t6], %[t8] \n\t" + "multu %[C_2], %[temp_reg2] \n\t" + "lw %[t5], 12*4(%[in1]) \n\t" + "sub %[t2], %[t2], %[t7] \n\t" + "sub %[t1], %[t4], %[t5] \n\t" + "sra %[t3], %[t5], 1 \n\t" + "sra %[temp_reg1], %[t2], 1 \n\t" + "addu %[t3], %[t3], %[t4] \n\t" + "sub %[temp_reg1], %[t1], %[temp_reg1] \n\t" + "sra %[temp_reg2], %[temp_reg2], 31 \n\t" + "sw %[temp_reg1], 6*4(%[tmp1]) \n\t" + "move %[t0], $0 \n\t" + "movn %[t0], %[C_2], %[temp_reg2] \n\t" + "mfhi %[temp_reg1] \n\t" + "addu %[t1], %[t1], %[t2] \n\t" + "sw %[t1], 16*4(%[tmp1]) \n\t" + "sub %[temp_reg4], %[t8], %[t6] \n\t" + "add %[temp_reg2], %[t7], %[t6] \n\t" + "mult $ac1, %[C_8], %[temp_reg4] \n\t" + "multu $ac2, %[C_4], %[temp_reg2] \n\t" + "sub %[t0], %[temp_reg1], %[t0] \n\t" + "sra %[temp_reg1], %[temp_reg2], 31 \n\t" + "move %[t2], $0 \n\t" + "movn %[t2], %[C_4], %[temp_reg1] \n\t" + "mfhi %[t1], $ac1 \n\t" + "mfhi %[temp_reg1], $ac2 \n\t" + "lw %[t6], 10*4(%[in1]) \n\t" + "lw %[t8], 14*4(%[in1]) \n\t" + "lw %[t7], 2*4(%[in1]) \n\t" + "lw %[t4], 6*4(%[in1]) \n\t" + "sub %[temp_reg3], %[t3], %[t0] \n\t" + "add %[temp_reg4], %[t3], %[t0] \n\t" + "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "add %[temp_reg4], %[temp_reg4], %[t1] \n\t" + "sub %[t2], %[temp_reg1], %[t2] \n\t" + "sw %[temp_reg4], 2*4(%[tmp1]) \n\t" + "sub %[temp_reg3], %[temp_reg3], %[t2] \n\t" + "add %[temp_reg1], %[t3], %[t2] \n\t" + "sw %[temp_reg3], 10*4(%[tmp1]) \n\t" + "sub %[temp_reg1], %[temp_reg1], %[t1] \n\t" + "addu %[temp_reg2], %[t6], %[t8] \n\t" + "sw %[temp_reg1], 14*4(%[tmp1]) \n\t" + "sub %[temp_reg2], %[temp_reg2], %[t7] \n\t" + "addu %[temp_reg3], %[t7], %[t6] \n\t" + "multu $ac3, %[C_3], %[temp_reg2] \n\t" + "multu %[C_1], %[temp_reg3] \n\t" + "sra %[temp_reg1], %[temp_reg2], 31 \n\t" + "move %[t1], $0 \n\t" + "sra %[temp_reg3], %[temp_reg3], 31 \n\t" + "movn %[t1], %[C_3], %[temp_reg1] \n\t" + "mfhi %[temp_reg1], $ac3 \n\t" + "mfhi %[temp_reg4] \n\t" + "move %[t2], $0 \n\t" + "movn %[t2], %[C_1], %[temp_reg3] \n\t" + "sub %[temp_reg3], %[t6], %[t8] \n\t" + "sub %[t2], %[temp_reg4], %[t2] \n\t" + "multu $ac1, %[C_7], %[temp_reg3] \n\t" + "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "sra %[temp_reg4], %[temp_reg3], 31 \n\t" + "sub %[t1], %[temp_reg1], %[t1] \n\t" + "move %[t3], $0 \n\t" + "sw %[t1], 4*4(%[tmp1]) \n\t" + "movn %[t3], %[C_7], %[temp_reg4] \n\t" + "multu $ac2, %[C_3A], %[t4] \n\t" + "add %[temp_reg2], %[t7], %[t8] \n\t" + "move %[t1], $0 \n\t" + "mfhi %[temp_reg4], $ac1 \n\t" + "multu $ac3,%[C_5], %[temp_reg2] \n\t" + "move %[t0], $0 \n\t" + "sra %[temp_reg1], %[temp_reg2], 31 \n\t" + "movn %[t1],%[C_5], %[temp_reg1] \n\t" + "sub %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" + "mfhi %[temp_reg1], $ac3 \n\t" + "sra %[temp_reg3], %[t4], 31 \n\t" + "movn %[t0], %[C_3A], %[temp_reg3] \n\t" + "mfhi %[temp_reg3], $ac2 \n\t" + "sub %[t3], %[temp_reg4], %[t3] \n\t" + "add %[temp_reg4], %[t3], %[t2] \n\t" + "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "sub %[t1], %[temp_reg1], %[t1] \n\t" 
+ "sub %[t0], %[temp_reg3], %[t0] \n\t" + "add %[temp_reg1], %[t2], %[t1] \n\t" + "add %[temp_reg4], %[temp_reg4], %[t0] \n\t" + "sub %[temp_reg2], %[t3], %[t1] \n\t" + "sw %[temp_reg4], 0*4(%[tmp1]) \n\t" + "sub %[temp_reg1], %[temp_reg1], %[t0] \n\t" + "sub %[temp_reg2], %[temp_reg2], %[t0] \n\t" + "sw %[temp_reg1], 12*4(%[tmp1]) \n\t" + "sw %[temp_reg2], 8*4(%[tmp1]) \n\t" + + : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1), + [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4), + [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0), + [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2), + [t3] "=&r" (t3), [t1] "=&r" (t1) + : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8), + [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7), + [C_3A] "r" (C_3A), [C_5] "r" (C_5) + : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", + "$ac3hi", "$ac3lo" + ); + } + + /** + * loop is unrolled four times + * + * values defined in tables(icos36[] and icos36h[]) are not loaded from + * these tables - they are directly loaded in appropriate registers + * + */ + + __asm__ volatile ( + "lw %[t2], 1*4(%[tmp]) \n\t" + "lw %[t3], 3*4(%[tmp]) \n\t" + "lw %[t0], 0*4(%[tmp]) \n\t" + "lw %[t1], 2*4(%[tmp]) \n\t" + "addu %[temp_reg1], %[t3], %[t2] \n\t" + "li %[temp_reg2], 0x807D2B1E \n\t" + "move %[s1], $0 \n\t" + "multu %[temp_reg2], %[temp_reg1] \n\t" + "sra %[temp_reg1], %[temp_reg1], 31 \n\t" + "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t" + "sub %[temp_reg3], %[t3], %[t2] \n\t" + "li %[temp_reg4], 0x2de5151 \n\t" + "mfhi %[temp_reg2] \n\t" + "addu %[s0], %[t1], %[t0] \n\t" + "lw %[temp_reg5], 9*4(%[win]) \n\t" + "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t" + "lw %[temp_reg6], 4*9*4(%[buf]) \n\t" + "sub %[s2], %[t1], %[t0] \n\t" + "lw %[temp_reg3], 29*4(%[win]) \n\t" + "subu %[s1], %[temp_reg2], %[s1] \n\t" + "lw %[temp_reg4], 28*4(%[win]) \n\t" + "add %[t0], %[s0], %[s1] \n\t" + "extr.w %[s3], $ac1,23 \n\t" + "mult $ac2, %[t0], %[temp_reg3] \n\t" + "sub %[t1], %[s0], %[s1] \n\t" + "lw %[temp_reg1], 4*8*4(%[buf]) \n\t" + "mult %[t1], %[temp_reg5] \n\t" + "lw %[temp_reg2], 8*4(%[win]) \n\t" + "mfhi %[temp_reg3], $ac2 \n\t" + "mult $ac3, %[t0], %[temp_reg4] \n\t" + "add %[t0], %[s2], %[s3] \n\t" + "mfhi %[temp_reg5] \n\t" + "mult $ac1, %[t1], %[temp_reg2] \n\t" + "sub %[t1], %[s2], %[s3] \n\t" + "sw %[temp_reg3], 4*9*4(%[buf]) \n\t" + "mfhi %[temp_reg4], $ac3 \n\t" + "lw %[temp_reg3], 37*4(%[win]) \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" + "lw %[temp_reg6], 17*4(%[win]) \n\t" + "sw %[temp_reg5], 32*9*4(%[out]) \n\t" + "sw %[temp_reg4], 4*8*4(%[buf]) \n\t" + "mult %[t1], %[temp_reg6] \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "lw %[temp_reg2], 0*4(%[win]) \n\t" + "lw %[temp_reg5], 4*17*4(%[buf]) \n\t" + "sw %[temp_reg1], 8*32*4(%[out]) \n\t" + "mfhi %[temp_reg6] \n\t" + "mult $ac1, %[t1], %[temp_reg2] \n\t" + "lw %[temp_reg4], 20*4(%[win]) \n\t" + "lw %[temp_reg1], 0(%[buf]) \n\t" + "mult $ac2, %[t0], %[temp_reg3] \n\t" + "mult %[t0], %[temp_reg4] \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "lw %[t0], 4*4(%[tmp]) \n\t" + "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" + "mfhi %[temp_reg3], $ac2 \n\t" + "mfhi %[temp_reg4] \n\t" + "sw %[temp_reg5], 17*32*4(%[out]) \n\t" + "lw %[t1], 6*4(%[tmp]) \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "lw %[t2], 5*4(%[tmp]) \n\t" + "sw %[temp_reg1], 0*32*4(%[out]) \n\t" + "addu %[s0], %[t1], %[t0] \n\t" + "sw 
%[temp_reg3], 4*17*4(%[buf]) \n\t" + "lw %[t3], 7*4(%[tmp]) \n\t" + "sub %[s2], %[t1], %[t0] \n\t" + "sw %[temp_reg4], 0(%[buf]) \n\t" + "addu %[temp_reg5], %[t3], %[t2] \n\t" + "li %[temp_reg6], 0x8483EE0C \n\t" + "move %[s1], $0 \n\t" + "multu %[temp_reg6], %[temp_reg5] \n\t" + "sub %[temp_reg1], %[t3], %[t2] \n\t" + "li %[temp_reg2], 0xf746ea \n\t" + "sra %[temp_reg5], %[temp_reg5], 31 \n\t" + "mult $ac1, %[temp_reg2], %[temp_reg1] \n\t" + "movn %[s1], %[temp_reg6], %[temp_reg5] \n\t" + "mfhi %[temp_reg5] \n\t" + "lw %[temp_reg3], 10*4(%[win]) \n\t" + "lw %[temp_reg4], 4*10*4(%[buf]) \n\t" + "extr.w %[s3], $ac1, 23 \n\t" + "lw %[temp_reg1], 4*7*4(%[buf]) \n\t" + "lw %[temp_reg2], 7*4(%[win]) \n\t" + "lw %[temp_reg6], 30*4(%[win]) \n\t" + "subu %[s1], %[temp_reg5], %[s1] \n\t" + "sub %[t1], %[s0], %[s1] \n\t" + "add %[t0], %[s0], %[s1] \n\t" + "mult $ac2, %[t1], %[temp_reg3] \n\t" + "mult $ac3, %[t1], %[temp_reg2] \n\t" + "mult %[t0], %[temp_reg6] \n\t" + "lw %[temp_reg5], 27*4(%[win]) \n\t" + "mult $ac1, %[t0], %[temp_reg5] \n\t" + "mfhi %[temp_reg3], $ac2 \n\t" + "mfhi %[temp_reg2], $ac3 \n\t" + "mfhi %[temp_reg6] \n\t" + "add %[t0], %[s2], %[s3] \n\t" + "sub %[t1], %[s2], %[s3] \n\t" + "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" + "lw %[temp_reg4], 16*4(%[win]) \n\t" + "mfhi %[temp_reg5], $ac1 \n\t" + "sw %[temp_reg3], 32*10*4(%[out]) \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "lw %[temp_reg3], 4*16*4(%[buf]) \n\t" + "sw %[temp_reg6], 4*10*4(%[buf]) \n\t" + "sw %[temp_reg1], 7*32*4(%[out]) \n\t" + "mult $ac2, %[t1], %[temp_reg4] \n\t" + "sw %[temp_reg5], 4*7*4(%[buf]) \n\t" + "lw %[temp_reg6], 1*4(%[win]) \n\t" + "lw %[temp_reg5], 4*1*4(%[buf]) \n\t" + "lw %[temp_reg1], 36*4(%[win]) \n\t" + "mult $ac3, %[t1], %[temp_reg6] \n\t" + "lw %[temp_reg2], 21*4(%[win]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "mult %[t0], %[temp_reg1] \n\t" + "mult $ac1, %[t0],%[temp_reg2] \n\t" + "lw %[t0], 8*4(%[tmp]) \n\t" + "mfhi %[temp_reg6], $ac3 \n\t" + "lw %[t1], 10*4(%[tmp]) \n\t" + "lw %[t3], 11*4(%[tmp]) \n\t" + "mfhi %[temp_reg1] \n\t" + "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" + "lw %[t2], 9*4(%[tmp]) \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" + "sw %[temp_reg3], 16*32*4(%[out]) \n\t" + "sw %[temp_reg5], 1*32*4(%[out]) \n\t" + "sw %[temp_reg1], 4*16*4(%[buf]) \n\t" + "addu %[temp_reg3], %[t3], %[t2] \n\t" + "li %[temp_reg4], 0x8D3B7CD6 \n\t" + "sw %[temp_reg2], 4*1*4(%[buf]) \n\t" + "multu %[temp_reg4],%[temp_reg3] \n\t" + "sra %[temp_reg3], %[temp_reg3], 31 \n\t" + "move %[s1], $0 \n\t" + "movn %[s1], %[temp_reg4], %[temp_reg3] \n\t" + "addu %[s0], %[t1], %[t0] \n\t" + "mfhi %[temp_reg3] \n\t" + "sub %[s2], %[t1], %[t0] \n\t" + "sub %[temp_reg5], %[t3], %[t2] \n\t" + "li %[temp_reg6], 0x976fd9 \n\t" + "lw %[temp_reg2], 11*4(%[win]) \n\t" + "lw %[temp_reg1], 4*11*4(%[buf]) \n\t" + "mult $ac1, %[temp_reg6], %[temp_reg5] \n\t" + "subu %[s1], %[temp_reg3], %[s1] \n\t" + "lw %[temp_reg5], 31*4(%[win]) \n\t" + "sub %[t1], %[s0], %[s1] \n\t" + "add %[t0], %[s0], %[s1] \n\t" + "mult $ac2, %[t1], %[temp_reg2] \n\t" + "mult %[t0], %[temp_reg5] \n\t" + "lw %[temp_reg4], 6*4(%[win]) \n\t" + "extr.w %[s3], $ac1, 23 \n\t" + "lw %[temp_reg3], 4*6*4(%[buf]) \n\t" + "mfhi %[temp_reg2], $ac2 \n\t" + "lw %[temp_reg6], 26*4(%[win]) \n\t" + "mfhi %[temp_reg5] \n\t" + "mult $ac3, %[t1], %[temp_reg4] \n\t" + "mult $ac1, %[t0], %[temp_reg6] \n\t" + "add %[t0], %[s2], %[s3] \n\t" + "sub %[t1], %[s2], %[s3] \n\t" + "add %[temp_reg2], 
%[temp_reg2], %[temp_reg1] \n\t" + "mfhi %[temp_reg4], $ac3 \n\t" + "mfhi %[temp_reg6], $ac1 \n\t" + "sw %[temp_reg5], 4*11*4(%[buf]) \n\t" + "sw %[temp_reg2], 32*11*4(%[out]) \n\t" + "lw %[temp_reg1], 4*15*4(%[buf]) \n\t" + "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" + "lw %[temp_reg2], 15*4(%[win]) \n\t" + "sw %[temp_reg3], 6*32*4(%[out]) \n\t" + "sw %[temp_reg6], 4*6*4(%[buf]) \n\t" + "mult %[t1], %[temp_reg2] \n\t" + "lw %[temp_reg3], 2*4(%[win]) \n\t" + "lw %[temp_reg4], 4*2*4(%[buf]) \n\t" + "lw %[temp_reg5], 35*4(%[win]) \n\t" + "mult $ac1, %[t1], %[temp_reg3] \n\t" + "mfhi %[temp_reg2] \n\t" + "lw %[temp_reg6], 22*4(%[win]) \n\t" + "mult $ac2, %[t0], %[temp_reg5] \n\t" + "lw %[t1], 14*4(%[tmp]) \n\t" + "mult $ac3, %[t0], %[temp_reg6] \n\t" + "lw %[t0], 12*4(%[tmp]) \n\t" + "mfhi %[temp_reg3], $ac1 \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "mfhi %[temp_reg5], $ac2 \n\t" + "sw %[temp_reg1], 15*32*4(%[out]) \n\t" + "mfhi %[temp_reg6], $ac3 \n\t" + "lw %[t2], 13*4(%[tmp]) \n\t" + "lw %[t3], 15*4(%[tmp]) \n\t" + "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" + "sw %[temp_reg5], 4*15*4(%[buf]) \n\t" + "addu %[temp_reg1], %[t3], %[t2] \n\t" + "li %[temp_reg2], 0x9C42577C \n\t" + "move %[s1], $0 \n\t" + "multu %[temp_reg2], %[temp_reg1] \n\t" + "sw %[temp_reg4], 2*32*4(%[out]) \n\t" + "sra %[temp_reg1], %[temp_reg1], 31 \n\t" + "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t" + "sub %[temp_reg3], %[t3], %[t2] \n\t" + "li %[temp_reg4], 0x6f94a2 \n\t" + "mfhi %[temp_reg1] \n\t" + "addu %[s0], %[t1], %[t0] \n\t" + "sw %[temp_reg6], 4*2*4(%[buf]) \n\t" + "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t" + "sub %[s2], %[t1], %[t0] \n\t" + "lw %[temp_reg5], 12*4(%[win]) \n\t" + "lw %[temp_reg6], 4*12*4(%[buf]) \n\t" + "subu %[s1], %[temp_reg1], %[s1] \n\t" + "sub %[t1], %[s0], %[s1] \n\t" + "lw %[temp_reg3], 32*4(%[win]) \n\t" + "mult $ac2, %[t1], %[temp_reg5] \n\t" + "add %[t0], %[s0], %[s1] \n\t" + "extr.w %[s3], $ac1, 23 \n\t" + "lw %[temp_reg2], 5*4(%[win]) \n\t" + "mult %[t0], %[temp_reg3] \n\t" + "mfhi %[temp_reg5], $ac2 \n\t" + "lw %[temp_reg4], 25*4(%[win]) \n\t" + "lw %[temp_reg1], 4*5*4(%[buf]) \n\t" + "mult $ac3, %[t1], %[temp_reg2] \n\t" + "mult $ac1, %[t0], %[temp_reg4] \n\t" + "mfhi %[temp_reg3] \n\t" + "add %[t0], %[s2], %[s3] \n\t" + "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" + "mfhi %[temp_reg2], $ac3 \n\t" + "mfhi %[temp_reg4], $ac1 \n\t" + "sub %[t1], %[s2], %[s3] \n\t" + "sw %[temp_reg5], 32*12*4(%[out]) \n\t" + "sw %[temp_reg3], 4*12*4(%[buf]) \n\t" + "lw %[temp_reg6], 14*4(%[win]) \n\t" + "lw %[temp_reg5], 4*14*4(%[buf]) \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "sw %[temp_reg4], 4*5*4(%[buf]) \n\t" + "sw %[temp_reg1], 5*32*4(%[out]) \n\t" + "mult %[t1], %[temp_reg6] \n\t" + "lw %[temp_reg4], 34*4(%[win]) \n\t" + "lw %[temp_reg2], 3*4(%[win]) \n\t" + "lw %[temp_reg1], 4*3*4(%[buf]) \n\t" + "mult $ac2, %[t0], %[temp_reg4] \n\t" + "mfhi %[temp_reg6] \n\t" + "mult $ac1, %[t1], %[temp_reg2] \n\t" + "lw %[temp_reg3], 23*4(%[win]) \n\t" + "lw %[s0], 16*4(%[tmp]) \n\t" + "mfhi %[temp_reg4], $ac2 \n\t" + "lw %[t1], 17*4(%[tmp]) \n\t" + "mult $ac3, %[t0], %[temp_reg3] \n\t" + "move %[s1], $0 \n\t" + "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" + "mfhi %[temp_reg2], $ac1 \n\t" + "sw %[temp_reg5], 14*32*4(%[out]) \n\t" + "sw %[temp_reg4], 4*14*4(%[buf]) \n\t" + "mfhi %[temp_reg3], $ac3 \n\t" + "li %[temp_reg5], 0xB504F334 \n\t" + "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" + "multu %[temp_reg5], %[t1] \n\t" + "lw 
%[temp_reg2], 4*13*4(%[buf]) \n\t" + "sw %[temp_reg1], 3*32*4(%[out]) \n\t" + "sra %[t1], %[t1], 31 \n\t" + "mfhi %[temp_reg6] \n\t" + "movn %[s1], %[temp_reg5], %[t1] \n\t" + "sw %[temp_reg3], 4*3*4(%[buf]) \n\t" + "lw %[temp_reg1], 13*4(%[win]) \n\t" + "lw %[temp_reg4], 4*4*4(%[buf]) \n\t" + "lw %[temp_reg3], 4*4(%[win]) \n\t" + "lw %[temp_reg5], 33*4(%[win]) \n\t" + "subu %[s1], %[temp_reg6], %[s1] \n\t" + "lw %[temp_reg6], 24*4(%[win]) \n\t" + "sub %[t1], %[s0], %[s1] \n\t" + "add %[t0], %[s0], %[s1] \n\t" + "mult $ac1, %[t1], %[temp_reg1] \n\t" + "mult $ac2, %[t1], %[temp_reg3] \n\t" + "mult $ac3, %[t0], %[temp_reg5] \n\t" + "mult %[t0], %[temp_reg6] \n\t" + "mfhi %[temp_reg1], $ac1 \n\t" + "mfhi %[temp_reg3], $ac2 \n\t" + "mfhi %[temp_reg5], $ac3 \n\t" + "mfhi %[temp_reg6] \n\t" + "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t" + "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" + "sw %[temp_reg2], 13*32*4(%[out]) \n\t" + "sw %[temp_reg4], 4*32*4(%[out]) \n\t" + "sw %[temp_reg5], 4*13*4(%[buf]) \n\t" + "sw %[temp_reg6], 4*4*4(%[buf]) \n\t" + + : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), + [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1), + [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3), + [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4), + [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6), + [out] "+r" (out) + : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf) + : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", + "$ac3hi", "$ac3lo" + ); +} + +static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in, + int count, int switch_point, int block_type) +{ + int j; + for (j=0 ; j < count; j++) { + /* apply window & overlap with previous buffer */ + + /* select window */ + int win_idx = (switch_point && j < 2) ? 0 : block_type; + int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))]; + + imdct36_mips_fixed(out, buf, in, win); + + in += 18; + buf += ((j&3) != 3 ? 1 : (72-3)); + out++; + } +} + +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM */ + +void ff_mpadsp_init_mipsdsp(MPADSPContext *s) +{ +#if HAVE_INLINE_ASM +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + s->apply_window_fixed = ff_mpadsp_apply_window_mips_fixed; + s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed; +#endif +#endif +} diff --git a/libavcodec/mips/mpegaudiodsp_mips_float.c b/libavcodec/mips/mpegaudiodsp_mips_float.c new file mode 100644 index 0000000000..270838ebf1 --- /dev/null +++ b/libavcodec/mips/mpegaudiodsp_mips_float.c @@ -0,0 +1,1261 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Bojan Zivkovic (bojan@mips.com) + * + * MPEG Audio decoder optimized for MIPS floating-point architecture + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/mpegaudiodsp_template.c + * libavcodec/dct32.c + */ + +#include <string.h> + +#include "libavutil/mips/asmdefs.h" +#include "libavcodec/mpegaudiodsp.h" + +#if HAVE_INLINE_ASM && HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + +static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window, + int *dither_state, float *samples, int incr) +{ + register const float *w, *w2, *p; + int j; + float *samples2; + float sum, sum2; + /* temporary variables */ + int incr1 = incr << 2; + int t_sample; + float in1, in2, in3, in4, in5, in6, in7, in8; + float *p2; + + /* copy to avoid wrap */ + memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf)); + + /** + * instructions are scheduled to minimize pipeline stall. + * use of round_sample function from the original code is + * changed with appropriate assembly instructions. 
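+ * the first unrolled block produces output sample 0, each pass of the
+ * loop then produces one sample through the samples pointer and one
+ * through samples2 (working inwards from the opposite end of the output),
+ * and the final block produces sample 16.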
+ */ + + __asm__ volatile ( + "lwc1 %[sum], 0(%[dither_state]) \t\n" + "sll %[t_sample], %[incr1], 5 \t\n" + "sub %[t_sample], %[t_sample], %[incr1] \n\t" + "li %[j], 4 \t\n" + "lwc1 %[in1], 0(%[window]) \t\n" + "lwc1 %[in2], 16*4(%[synth_buf]) \t\n" + "sw $zero, 0(%[dither_state]) \t\n" + "lwc1 %[in3], 64*4(%[window]) \t\n" + "lwc1 %[in4], 80*4(%[synth_buf]) \t\n" + PTR_ADDU "%[samples2],%[samples], %[t_sample] \t\n" + "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in5], 128*4(%[window]) \t\n" + "lwc1 %[in6], 144*4(%[synth_buf]) \t\n" + "lwc1 %[in7], 192*4(%[window]) \t\n" + "madd.s %[sum], %[sum], %[in3], %[in4] \t\n" + "lwc1 %[in8], 208*4(%[synth_buf]) \t\n" + "lwc1 %[in1], 256*4(%[window]) \t\n" + "lwc1 %[in2], 272*4(%[synth_buf]) \t\n" + "madd.s %[sum], %[sum], %[in5], %[in6] \t\n" + "lwc1 %[in3], 320*4(%[window]) \t\n" + "lwc1 %[in4], 336*4(%[synth_buf]) \t\n" + "lwc1 %[in5], 384*4(%[window]) \t\n" + "madd.s %[sum], %[sum], %[in7], %[in8] \t\n" + "lwc1 %[in6], 400*4(%[synth_buf]) \t\n" + "lwc1 %[in7], 448*4(%[window]) \t\n" + "lwc1 %[in8], 464*4(%[synth_buf]) \t\n" + "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in1], 32*4(%[window]) \t\n" + "lwc1 %[in2], 48*4(%[synth_buf]) \t\n" + "madd.s %[sum], %[sum], %[in3], %[in4] \t\n" + "lwc1 %[in3], 96*4(%[window]) \t\n" + "lwc1 %[in4], 112*4(%[synth_buf]) \t\n" + "madd.s %[sum], %[sum], %[in5], %[in6] \t\n" + "lwc1 %[in5], 160*4(%[window]) \t\n" + "lwc1 %[in6], 176*4(%[synth_buf]) \t\n" + "madd.s %[sum], %[sum], %[in7], %[in8] \t\n" + "lwc1 %[in7], 224*4(%[window]) \t\n" + "lwc1 %[in8], 240*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in1], 288*4(%[window]) \t\n" + "lwc1 %[in2], 304*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" + "lwc1 %[in3], 352*4(%[window]) \t\n" + "lwc1 %[in4], 368*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" + "lwc1 %[in5], 416*4(%[window]) \t\n" + "lwc1 %[in6], 432*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" + "lwc1 %[in7], 480*4(%[window]) \t\n" + "lwc1 %[in8], 496*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + PTR_ADDU "%[w], %[window], 4 \t\n" + "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" + PTR_ADDU "%[w2], %[window], 124 \t\n" + PTR_ADDIU "%[p], %[synth_buf], 68 \t\n" + PTR_ADDIU "%[p2], %[synth_buf], 188 \t\n" + "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" + "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" + "swc1 %[sum], 0(%[samples]) \t\n" + PTR_ADDU "%[samples], %[samples], %[incr1] \t\n" + + /* calculate two samples at the same time to avoid one memory + access per two sample */ + + "ff_mpadsp_apply_window_loop%=: \t\n" + "lwc1 %[in1], 0(%[w]) \t\n" + "lwc1 %[in2], 0(%[p]) \t\n" + "lwc1 %[in3], 0(%[w2]) \t\n" + "lwc1 %[in4], 64*4(%[w]) \t\n" + "lwc1 %[in5], 64*4(%[p]) \t\n" + "lwc1 %[in6], 64*4(%[w2]) \t\n" + "mul.s %[sum], %[in1], %[in2] \t\n" + "mul.s %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 128*4(%[w]) \t\n" + "lwc1 %[in2], 128*4(%[p]) \t\n" + "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" + "nmadd.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 128*4(%[w2]) \t\n" + "lwc1 %[in4], 192*4(%[w]) \t\n" + "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in5], 192*4(%[p]) \t\n" + "lwc1 %[in6], 192*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 256*4(%[w]) \t\n" + "lwc1 %[in2], 256*4(%[p]) \t\n" + "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 
256*4(%[w2]) \t\n" + "lwc1 %[in4], 320*4(%[w]) \t\n" + "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in5], 320*4(%[p]) \t\n" + "lwc1 %[in6], 320*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 384*4(%[w]) \t\n" + "lwc1 %[in2], 384*4(%[p]) \t\n" + "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 384*4(%[w2]) \t\n" + "lwc1 %[in4], 448*4(%[w]) \t\n" + "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in5], 448*4(%[p]) \t\n" + "lwc1 %[in6], 448*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" + "lwc1 %[in1], 32*4(%[w]) \t\n" + "lwc1 %[in2], 0(%[p2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 32*4(%[w2]) \t\n" + "lwc1 %[in4], 96*4(%[w]) \t\n" + "lwc1 %[in5], 64*4(%[p2]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in6], 96*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 160*4(%[w]) \t\n" + "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" + "lwc1 %[in2], 128*4(%[p2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 160*4(%[w2]) \t\n" + "lwc1 %[in4], 224*4(%[w]) \t\n" + "lwc1 %[in5], 192*4(%[p2]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in6], 224*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 288*4(%[w]) \t\n" + "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" + "lwc1 %[in2], 256*4(%[p2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 288*4(%[w2]) \t\n" + "lwc1 %[in4], 352*4(%[w]) \t\n" + "lwc1 %[in5], 320*4(%[p2]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in6], 352*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + "lwc1 %[in1], 416*4(%[w]) \t\n" + "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" + "lwc1 %[in2], 384*4(%[p2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "lwc1 %[in3], 416*4(%[w2]) \t\n" + "lwc1 %[in4], 480*4(%[w]) \t\n" + "lwc1 %[in5], 448*4(%[p2]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in6], 480*4(%[w2]) \t\n" + "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" + PTR_ADDIU "%[w], %[w], 4 \t\n" + "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" + PTR_ADDIU "%[w2], %[w2], -4 \t\n" + "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" + "addu %[j], %[j], 4 \t\n" + PTR_ADDIU "%[p], 4 \t\n" + "swc1 %[sum], 0(%[samples]) \t\n" + PTR_ADDIU "%[p2], -4 \t\n" + "swc1 %[sum2], 0(%[samples2]) \t\n" + PTR_ADDU "%[samples], %[samples], %[incr1] \t\n" + PTR_SUBU "%[samples2],%[samples2], %[incr1] \t\n" + "bne %[j], 64, ff_mpadsp_apply_window_loop%= \t\n" + + "lwc1 %[in1], 48*4(%[window]) \t\n" + "lwc1 %[in2], 32*4(%[synth_buf]) \t\n" + "lwc1 %[in3], 112*4(%[window]) \t\n" + "lwc1 %[in4], 96*4(%[synth_buf]) \t\n" + "lwc1 %[in5], 176*4(%[window]) \t\n" + "lwc1 %[in6], 160*4(%[synth_buf]) \t\n" + "mul.s %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in7], 240*4(%[window]) \t\n" + "lwc1 %[in8], 224*4(%[synth_buf]) \t\n" + "lwc1 %[in1], 304*4(%[window]) \t\n" + "nmadd.s %[sum], %[sum], %[in3], %[in4] \t\n" + "lwc1 %[in2], 288*4(%[synth_buf]) \t\n" + "lwc1 %[in3], 368*4(%[window]) \t\n" + "lwc1 %[in4], 352*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" + "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" + "lwc1 %[in5], 432*4(%[window]) \t\n" + "lwc1 %[in6], 416*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" + "lwc1 %[in7], 496*4(%[window]) \t\n" + "lwc1 
%[in8], 480*4(%[synth_buf]) \t\n" + "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" + "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" + "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" + "swc1 %[sum], 0(%[samples]) \t\n" + + : [sum] "=&f" (sum), [sum2] "=&f" (sum2), + [w2] "=&r" (w2), [w] "=&r" (w), + [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j), + [samples] "+r" (samples), [samples2] "=&r" (samples2), + [in1] "=&f" (in1), [in2] "=&f" (in2), + [in3] "=&f" (in3), [in4] "=&f" (in4), + [in5] "=&f" (in5), [in6] "=&f" (in6), + [in7] "=&f" (in7), [in8] "=&f" (in8), + [t_sample] "=&r" (t_sample) + : [synth_buf] "r" (synth_buf), [window] "r" (window), + [dither_state] "r" (dither_state), [incr1] "r" (incr1) + : "memory" + ); +} + +static void ff_dct32_mips_float(float *out, const float *tab) +{ + float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7, + val8 , val9 , val10, val11, val12, val13, val14, val15, + val16, val17, val18, val19, val20, val21, val22, val23, + val24, val25, val26, val27, val28, val29, val30, val31; + float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8, + fTmp9, fTmp10, fTmp11; + + /** + * instructions are scheduled to minimize pipeline stall. + */ + __asm__ volatile ( + "lwc1 %[fTmp1], 0*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 31*4(%[tab]) \n\t" + "lwc1 %[fTmp3], 15*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 16*4(%[tab]) \n\t" + "li.s %[fTmp7], 0.50241928618815570551 \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp10], 0.50060299823519630134 \n\t" + "li.s %[fTmp11], 10.19000812354805681150 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "add.s %[val0], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val15], %[fTmp5], %[fTmp6] \n\t" + "lwc1 %[fTmp1], 7*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 24*4(%[tab]) \n\t" + "madd.s %[val16], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val15], %[val15], %[fTmp7] \n\t" + "lwc1 %[fTmp3], 8*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 23*4(%[tab]) \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val31], %[val31], %[fTmp7] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp7], 5.10114861868916385802 \n\t" + "li.s %[fTmp10], 0.67480834145500574602 \n\t" + "li.s %[fTmp11], 0.74453627100229844977 \n\t" + "add.s %[val7], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val8], %[fTmp5], %[fTmp6] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "li.s %[fTmp1], 0.50979557910415916894 \n\t" + "sub.s %[fTmp2], %[val0], %[val7] \n\t" + "mul.s %[val8], %[val8], %[fTmp7] \n\t" + "madd.s %[val23], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "add.s %[val0], %[val0], %[val7] \n\t" + "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp2], %[val15], %[val8] \n\t" + "add.s %[val8], %[val15], %[val8] \n\t" + "mul.s %[val24], %[val24], %[fTmp7] \n\t" + "sub.s %[fTmp3], %[val16], %[val23] \n\t" + "add.s %[val16], %[val16], %[val23] \n\t" + "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp4], %[val31], %[val24] \n\t" + "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t" + "add.s %[val24], %[val31], %[val24] \n\t" + "mul.s %[val31], %[fTmp1], %[fTmp4] \n\t" + + : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), + [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), + [fTmp7] "=&f" (fTmp7), [fTmp8] 
"=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), + [val0] "=f" (val0), [val7] "=f" (val7), + [val8] "=f" (val8), [val15] "=f" (val15), + [val16] "=f" (val16), [val23] "=f" (val23), + [val24] "=f" (val24), [val31] "=f" (val31) + : [tab] "r" (tab) + : "memory" + ); + + __asm__ volatile ( + "lwc1 %[fTmp1], 3*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 28*4(%[tab]) \n\t" + "lwc1 %[fTmp3], 12*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 19*4(%[tab]) \n\t" + "li.s %[fTmp7], 0.64682178335999012954 \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp10], 0.53104259108978417447 \n\t" + "li.s %[fTmp11], 1.48416461631416627724 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "add.s %[val3], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val12], %[fTmp5], %[fTmp6] \n\t" + "lwc1 %[fTmp1], 4*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 27*4(%[tab]) \n\t" + "madd.s %[val19], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val12], %[val12], %[fTmp7] \n\t" + "lwc1 %[fTmp3], 11*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 20*4(%[tab]) \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val28], %[val28], %[fTmp7] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "li.s %[fTmp7], 0.78815462345125022473 \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp10], 0.55310389603444452782 \n\t" + "li.s %[fTmp11], 1.16943993343288495515 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "add.s %[val4], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val11], %[fTmp5], %[fTmp6] \n\t" + "li.s %[fTmp1], 2.56291544774150617881 \n\t" + "madd.s %[val20], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val11], %[val11], %[fTmp7] \n\t" + "sub.s %[fTmp2], %[val3], %[val4] \n\t" + "add.s %[val3], %[val3], %[val4] \n\t" + "sub.s %[fTmp4], %[val19], %[val20] \n\t" + "mul.s %[val27], %[val27], %[fTmp7] \n\t" + "sub.s %[fTmp3], %[val12], %[val11] \n\t" + "mul.s %[val4], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val11], %[val12], %[val11] \n\t" + "add.s %[val19], %[val19], %[val20] \n\t" + "mul.s %[val20], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val12], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val28], %[val27] \n\t" + "add.s %[val27], %[val28], %[val27] \n\t" + "mul.s %[val28], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), + [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), + [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), + [val3] "=f" (val3), [val4] "=f" (val4), + [val11] "=f" (val11), [val12] "=f" (val12), + [val19] "=f" (val19), [val20] "=f" (val20), + [val27] "=f" (val27), [val28] "=f" (val28) + : [tab] "r" (tab) + : "memory" + ); + + __asm__ volatile ( + "li.s %[fTmp1], 0.54119610014619698439 \n\t" + "sub.s %[fTmp2], %[val0], %[val3] \n\t" + "add.s %[val0], %[val0], %[val3] \n\t" + "sub.s %[fTmp3], %[val7], %[val4] \n\t" + "add.s %[val4], %[val7], %[val4] \n\t" + "sub.s %[fTmp4], %[val8], %[val11] \n\t" + "mul.s %[val3], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val8], %[val8], %[val11] \n\t" + "mul.s %[val7], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val15], %[val12] \n\t" + "mul.s %[val11], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val12], %[val15], %[val12] \n\t" + "mul.s %[val15], %[fTmp1], 
%[fTmp2] \n\t" + + : [val0] "+f" (val0), [val3] "+f" (val3), + [val4] "+f" (val4), [val7] "+f" (val7), + [val8] "+f" (val8), [val11] "+f" (val11), + [val12] "+f" (val12), [val15] "+f" (val15), + [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2), + [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4) + : + ); + + __asm__ volatile ( + "sub.s %[fTmp2], %[val16], %[val19] \n\t" + "add.s %[val16], %[val16], %[val19] \n\t" + "sub.s %[fTmp3], %[val23], %[val20] \n\t" + "add.s %[val20], %[val23], %[val20] \n\t" + "sub.s %[fTmp4], %[val24], %[val27] \n\t" + "mul.s %[val19], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val24], %[val24], %[val27] \n\t" + "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val31], %[val28] \n\t" + "mul.s %[val27], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val28], %[val31], %[val28] \n\t" + "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20), + [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27), + [val28] "+f" (val28), [val31] "+f" (val31) + : [fTmp1] "f" (fTmp1) + ); + + __asm__ volatile ( + "lwc1 %[fTmp1], 1*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 30*4(%[tab]) \n\t" + "lwc1 %[fTmp3], 14*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 17*4(%[tab]) \n\t" + "li.s %[fTmp7], 0.52249861493968888062 \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp10], 0.50547095989754365998 \n\t" + "li.s %[fTmp11], 3.40760841846871878570 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "add.s %[val1], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val14], %[fTmp5], %[fTmp6] \n\t" + "lwc1 %[fTmp1], 6*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 25*4(%[tab]) \n\t" + "madd.s %[val17], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val14], %[val14], %[fTmp7] \n\t" + "lwc1 %[fTmp3], 9*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 22*4(%[tab]) \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val30], %[val30], %[fTmp7] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp7], 1.72244709823833392782 \n\t" + "li.s %[fTmp10], 0.62250412303566481615 \n\t" + "li.s %[fTmp11], 0.83934964541552703873 \n\t" + "add.s %[val6], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val9], %[fTmp5], %[fTmp6] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "li.s %[fTmp1], 0.60134488693504528054 \n\t" + "sub.s %[fTmp2], %[val1], %[val6] \n\t" + "add.s %[val1], %[val1], %[val6] \n\t" + "mul.s %[val9], %[val9], %[fTmp7] \n\t" + "madd.s %[val22], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val25], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val6], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp2], %[val14], %[val9] \n\t" + "add.s %[val9], %[val14], %[val9] \n\t" + "mul.s %[val25], %[val25], %[fTmp7] \n\t" + "sub.s %[fTmp3], %[val17], %[val22] \n\t" + "add.s %[val17], %[val17], %[val22] \n\t" + "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp2], %[val30], %[val25] \n\t" + "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t" + "add.s %[val25], %[val30], %[val25] \n\t" + "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), + [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), + [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [fTmp10] "=&f" 
(fTmp10), [fTmp11] "=&f" (fTmp11), + [val1] "=f" (val1), [val6] "=f" (val6), + [val9] "=f" (val9), [val14] "=f" (val14), + [val17] "=f" (val17), [val22] "=f" (val22), + [val25] "=f" (val25), [val30] "=f" (val30) + : [tab] "r" (tab) + : "memory" + ); + + __asm__ volatile ( + "lwc1 %[fTmp1], 2*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 29*4(%[tab]) \n\t" + "lwc1 %[fTmp3], 13*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 18*4(%[tab]) \n\t" + "li.s %[fTmp7], 0.56694403481635770368 \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp10], 0.51544730992262454697 \n\t" + "li.s %[fTmp11], 2.05778100995341155085 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "add.s %[val2], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val13], %[fTmp5], %[fTmp6] \n\t" + "lwc1 %[fTmp1], 5*4(%[tab]) \n\t" + "lwc1 %[fTmp2], 26*4(%[tab]) \n\t" + "madd.s %[val18], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val13], %[val13], %[fTmp7] \n\t" + "lwc1 %[fTmp3], 10*4(%[tab]) \n\t" + "lwc1 %[fTmp4], 21*4(%[tab]) \n\t" + "mul.s %[val29], %[val29], %[fTmp7] \n\t" + "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" + "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" + "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" + "li.s %[fTmp7], 1.06067768599034747134 \n\t" + "li.s %[fTmp10], 0.58293496820613387367 \n\t" + "li.s %[fTmp11], 0.97256823786196069369 \n\t" + "add.s %[val5], %[fTmp5], %[fTmp6] \n\t" + "sub.s %[val10], %[fTmp5], %[fTmp6] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "li.s %[fTmp1], 0.89997622313641570463 \n\t" + "sub.s %[fTmp2], %[val2], %[val5] \n\t" + "mul.s %[val10], %[val10], %[fTmp7] \n\t" + "madd.s %[val21], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "add.s %[val2], %[val2], %[val5] \n\t" + "mul.s %[val5], %[fTmp1], %[fTmp2] \n\t" + "sub.s %[fTmp3], %[val13], %[val10] \n\t" + "add.s %[val10], %[val13], %[val10] \n\t" + "mul.s %[val26], %[val26], %[fTmp7] \n\t" + "sub.s %[fTmp4], %[val18], %[val21] \n\t" + "add.s %[val18], %[val18], %[val21] \n\t" + "mul.s %[val13], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val29], %[val26] \n\t" + "add.s %[val26], %[val29], %[val26] \n\t" + "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val29], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), + [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), + [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), + [val2] "=f" (val2), [val5] "=f" (val5), + [val10] "=f" (val10), [val13] "=f" (val13), + [val18] "=f" (val18), [val21] "=f" (val21), + [val26] "=f" (val26), [val29] "=f" (val29) + : [tab] "r" (tab) + : "memory" + ); + + __asm__ volatile ( + "li.s %[fTmp1], 1.30656296487637652785 \n\t" + "sub.s %[fTmp2], %[val1], %[val2] \n\t" + "add.s %[val1], %[val1], %[val2] \n\t" + "sub.s %[fTmp3], %[val6], %[val5] \n\t" + "add.s %[val5], %[val6], %[val5] \n\t" + "sub.s %[fTmp4], %[val9], %[val10] \n\t" + "mul.s %[val2], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val9], %[val9], %[val10] \n\t" + "mul.s %[val6], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val14], %[val13] \n\t" + "mul.s %[val10], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val13], %[val14], %[val13] \n\t" + "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" 
(fTmp2), + [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val1] "+f" (val1), [val2] "+f" (val2), + [val5] "+f" (val5), [val6] "+f" (val6), + [val9] "+f" (val9), [val10] "+f" (val10), + [val13] "+f" (val13), [val14] "+f" (val14) + : + ); + + __asm__ volatile ( + "sub.s %[fTmp2], %[val17], %[val18] \n\t" + "add.s %[val17], %[val17], %[val18] \n\t" + "sub.s %[fTmp3], %[val22], %[val21] \n\t" + "add.s %[val21], %[val22], %[val21] \n\t" + "sub.s %[fTmp4], %[val25], %[val26] \n\t" + "mul.s %[val18], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val25], %[val25], %[val26] \n\t" + "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val30], %[val29] \n\t" + "mul.s %[val26], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val29], %[val30], %[val29] \n\t" + "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t" + + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21), + [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26), + [val29] "+f" (val29), [val30] "+f" (val30) + : [fTmp1] "f" (fTmp1) + ); + + __asm__ volatile ( + "li.s %[fTmp1], 0.70710678118654752439 \n\t" + "sub.s %[fTmp2], %[val0], %[val1] \n\t" + "add.s %[val0], %[val0], %[val1] \n\t" + "sub.s %[fTmp3], %[val3], %[val2] \n\t" + "add.s %[val2], %[val3], %[val2] \n\t" + "sub.s %[fTmp4], %[val4], %[val5] \n\t" + "mul.s %[val1], %[fTmp1], %[fTmp2] \n\t" + "swc1 %[val0], 0(%[out]) \n\t" + "mul.s %[val3], %[fTmp3], %[fTmp1] \n\t" + "add.s %[val4], %[val4], %[val5] \n\t" + "mul.s %[val5], %[fTmp1], %[fTmp4] \n\t" + "swc1 %[val1], 16*4(%[out]) \n\t" + "sub.s %[fTmp2], %[val7], %[val6] \n\t" + "add.s %[val2], %[val2], %[val3] \n\t" + "swc1 %[val3], 24*4(%[out]) \n\t" + "add.s %[val6], %[val7], %[val6] \n\t" + "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t" + "swc1 %[val2], 8*4(%[out]) \n\t" + "add.s %[val6], %[val6], %[val7] \n\t" + "swc1 %[val7], 28*4(%[out]) \n\t" + "add.s %[val4], %[val4], %[val6] \n\t" + "add.s %[val6], %[val6], %[val5] \n\t" + "add.s %[val5], %[val5], %[val7] \n\t" + "swc1 %[val4], 4*4(%[out]) \n\t" + "swc1 %[val5], 20*4(%[out]) \n\t" + "swc1 %[val6], 12*4(%[out]) \n\t" + + : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2), + [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val0] "+f" (val0), [val1] "+f" (val1), + [val2] "+f" (val2), [val3] "+f" (val3), + [val4] "+f" (val4), [val5] "+f" (val5), + [val6] "+f" (val6), [val7] "+f" (val7) + : [out] "r" (out) + ); + + __asm__ volatile ( + "sub.s %[fTmp2], %[val8], %[val9] \n\t" + "add.s %[val8], %[val8], %[val9] \n\t" + "sub.s %[fTmp3], %[val11], %[val10] \n\t" + "add.s %[val10], %[val11], %[val10] \n\t" + "sub.s %[fTmp4], %[val12], %[val13] \n\t" + "mul.s %[val9], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val12], %[val12], %[val13] \n\t" + "mul.s %[val11], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val15], %[val14] \n\t" + "mul.s %[val13], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val14], %[val15], %[val14] \n\t" + "add.s %[val10], %[val10], %[val11] \n\t" + "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val14], %[val14], %[val15] \n\t" + "add.s %[val12], %[val12], %[val14] \n\t" + "add.s %[val14], %[val14], %[val13] \n\t" + "add.s %[val13], %[val13], %[val15] \n\t" + "add.s %[val8], %[val8], %[val12] \n\t" + "add.s %[val12], %[val12], %[val10] \n\t" + "add.s %[val10], %[val10], %[val14] \n\t" + "add.s %[val14], %[val14], %[val9] \n\t" + "add.s %[val9], %[val9], %[val13] \n\t" + "add.s %[val13], %[val13], %[val11] \n\t" + "add.s %[val11], %[val11], %[val15] \n\t" + "swc1 %[val8], 2*4(%[out]) \n\t" + "swc1 %[val9], 
18*4(%[out]) \n\t" + "swc1 %[val10], 10*4(%[out]) \n\t" + "swc1 %[val11], 26*4(%[out]) \n\t" + "swc1 %[val12], 6*4(%[out]) \n\t" + "swc1 %[val13], 22*4(%[out]) \n\t" + "swc1 %[val14], 14*4(%[out]) \n\t" + "swc1 %[val15], 30*4(%[out]) \n\t" + + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val8] "+f" (val8), [val9] "+f" (val9), [val10] "+f" (val10), + [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13), + [val14] "+f" (val14), [val15] "+f" (val15) + : [fTmp1] "f" (fTmp1), [out] "r" (out) + ); + + __asm__ volatile ( + "sub.s %[fTmp2], %[val16], %[val17] \n\t" + "add.s %[val16], %[val16], %[val17] \n\t" + "sub.s %[fTmp3], %[val19], %[val18] \n\t" + "add.s %[val18], %[val19], %[val18] \n\t" + "sub.s %[fTmp4], %[val20], %[val21] \n\t" + "mul.s %[val17], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val20], %[val20], %[val21] \n\t" + "mul.s %[val19], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val23], %[val22] \n\t" + "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val22], %[val23], %[val22] \n\t" + "add.s %[val18], %[val18], %[val19] \n\t" + "mul.s %[val23], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val22], %[val22], %[val23] \n\t" + "add.s %[val20], %[val20], %[val22] \n\t" + "add.s %[val22], %[val22], %[val21] \n\t" + "add.s %[val21], %[val21], %[val23] \n\t" + + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18), + [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21), + [val22] "+f" (val22), [val23] "+f" (val23) + : [fTmp1] "f" (fTmp1) + ); + + __asm__ volatile ( + "sub.s %[fTmp2], %[val24], %[val25] \n\t" + "add.s %[val24], %[val24], %[val25] \n\t" + "sub.s %[fTmp3], %[val27], %[val26] \n\t" + "add.s %[val26], %[val27], %[val26] \n\t" + "sub.s %[fTmp4], %[val28], %[val29] \n\t" + "mul.s %[val25], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val28], %[val28], %[val29] \n\t" + "mul.s %[val27], %[fTmp1], %[fTmp3] \n\t" + "sub.s %[fTmp2], %[val31], %[val30] \n\t" + "mul.s %[val29], %[fTmp1], %[fTmp4] \n\t" + "add.s %[val30], %[val31], %[val30] \n\t" + "add.s %[val26], %[val26], %[val27] \n\t" + "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t" + "add.s %[val30], %[val30], %[val31] \n\t" + "add.s %[val28], %[val28], %[val30] \n\t" + "add.s %[val30], %[val30], %[val29] \n\t" + "add.s %[val29], %[val29], %[val31] \n\t" + "add.s %[val24], %[val24], %[val28] \n\t" + "add.s %[val28], %[val28], %[val26] \n\t" + "add.s %[val26], %[val26], %[val30] \n\t" + "add.s %[val30], %[val30], %[val25] \n\t" + "add.s %[val25], %[val25], %[val29] \n\t" + "add.s %[val29], %[val29], %[val27] \n\t" + "add.s %[val27], %[val27], %[val31] \n\t" + + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), + [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26), + [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29), + [val30] "+f" (val30), [val31] "+f" (val31) + : [fTmp1] "f" (fTmp1) + ); + + out[ 1] = val16 + val24; + out[17] = val17 + val25; + out[ 9] = val18 + val26; + out[25] = val19 + val27; + out[ 5] = val20 + val28; + out[21] = val21 + val29; + out[13] = val22 + val30; + out[29] = val23 + val31; + out[ 3] = val24 + val20; + out[19] = val25 + val21; + out[11] = val26 + val22; + out[27] = val27 + val23; + out[ 7] = val28 + val18; + out[23] = val29 + val19; + out[15] = val30 + val17; + out[31] = val31; +} + +static void imdct36_mips_float(float *out, float *buf, float *in, float *win) +{ + float t0, t1, t2, t3, s0, s1, s2, s3; + float tmp[18]; + /* temporary variables 
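+     * (in1..in6 hold values loaded from the input, out1..out5 hold partial
+     * sums that are written back, c1..c9 hold the constants that the C
+     * template otherwise takes from macros and tables)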
*/ + float in1, in2, in3, in4, in5, in6; + float out1, out2, out3, out4, out5; + float c1, c2, c3, c4, c5, c6, c7, c8, c9; + + /** + * all loops are unrolled totally, and instructions are scheduled to + * minimize pipeline stall. instructions of the first two loops are + * reorganized, in order to eliminate unnecessary readings and + * writings into array. values defined in macros and tables are + * eliminated - they are directly loaded in appropriate variables + */ + + /* loop 1 and 2 */ + __asm__ volatile ( + "lwc1 %[in1], 17*4(%[in]) \t\n" + "lwc1 %[in2], 16*4(%[in]) \t\n" + "lwc1 %[in3], 15*4(%[in]) \t\n" + "lwc1 %[in4], 14*4(%[in]) \t\n" + "lwc1 %[in5], 13*4(%[in]) \t\n" + "lwc1 %[in6], 12*4(%[in]) \t\n" + "add.s %[out1], %[in1], %[in2] \t\n" + "add.s %[out2], %[in2], %[in3] \t\n" + "add.s %[out3], %[in3], %[in4] \t\n" + "add.s %[out4], %[in4], %[in5] \t\n" + "add.s %[out5], %[in5], %[in6] \t\n" + "lwc1 %[in1], 11*4(%[in]) \t\n" + "swc1 %[out2], 16*4(%[in]) \t\n" + "add.s %[out1], %[out1], %[out3] \t\n" + "swc1 %[out4], 14*4(%[in]) \t\n" + "add.s %[out3], %[out3], %[out5] \t\n" + "lwc1 %[in2], 10*4(%[in]) \t\n" + "lwc1 %[in3], 9*4(%[in]) \t\n" + "swc1 %[out1], 17*4(%[in]) \t\n" + "lwc1 %[in4], 8*4(%[in]) \t\n" + "swc1 %[out3], 15*4(%[in]) \t\n" + "add.s %[out1], %[in6], %[in1] \t\n" + "add.s %[out2], %[in1], %[in2] \t\n" + "add.s %[out3], %[in2], %[in3] \t\n" + "add.s %[out4], %[in3], %[in4] \t\n" + "lwc1 %[in5], 7*4(%[in]) \t\n" + "swc1 %[out1], 12*4(%[in]) \t\n" + "add.s %[out5], %[out5], %[out2] \t\n" + "swc1 %[out3], 10*4(%[in]) \t\n" + "add.s %[out2], %[out2], %[out4] \t\n" + "lwc1 %[in6], 6*4(%[in]) \t\n" + "lwc1 %[in1], 5*4(%[in]) \t\n" + "swc1 %[out5], 13*4(%[in]) \t\n" + "lwc1 %[in2], 4*4(%[in]) \t\n" + "swc1 %[out2], 11*4(%[in]) \t\n" + "add.s %[out5], %[in4], %[in5] \t\n" + "add.s %[out1], %[in5], %[in6] \t\n" + "add.s %[out2], %[in6], %[in1] \t\n" + "add.s %[out3], %[in1], %[in2] \t\n" + "lwc1 %[in3], 3*4(%[in]) \t\n" + "swc1 %[out5], 8*4(%[in]) \t\n" + "add.s %[out4], %[out4], %[out1] \t\n" + "swc1 %[out2], 6*4(%[in]) \t\n" + "add.s %[out1], %[out1], %[out3] \t\n" + "lwc1 %[in4], 2*4(%[in]) \t\n" + "lwc1 %[in5], 1*4(%[in]) \t\n" + "swc1 %[out4], 9*4(%[in]) \t\n" + "lwc1 %[in6], 0(%[in]) \t\n" + "swc1 %[out1], 7*4(%[in]) \t\n" + "add.s %[out4], %[in2], %[in3] \t\n" + "add.s %[out5], %[in3], %[in4] \t\n" + "add.s %[out1], %[in4], %[in5] \t\n" + "add.s %[out2], %[in5], %[in6] \t\n" + "swc1 %[out4], 4*4(%[in]) \t\n" + "add.s %[out3], %[out3], %[out5] \t\n" + "swc1 %[out1], 2*4(%[in]) \t\n" + "add.s %[out5], %[out5], %[out2] \t\n" + "swc1 %[out2], 1*4(%[in]) \t\n" + "swc1 %[out3], 5*4(%[in]) \t\n" + "swc1 %[out5], 3*4(%[in]) \t\n" + + : [in1] "=&f" (in1), [in2] "=&f" (in2), + [in3] "=&f" (in3), [in4] "=&f" (in4), + [in5] "=&f" (in5), [in6] "=&f" (in6), + [out1] "=&f" (out1), [out2] "=&f" (out2), + [out3] "=&f" (out3), [out4] "=&f" (out4), + [out5] "=&f" (out5) + : [in] "r" (in) + : "memory" + ); + + /* loop 3 */ + __asm__ volatile ( + "li.s %[c1], 0.5 \t\n" + "lwc1 %[in1], 8*4(%[in]) \t\n" + "lwc1 %[in2], 16*4(%[in]) \t\n" + "lwc1 %[in3], 4*4(%[in]) \t\n" + "lwc1 %[in4], 0(%[in]) \t\n" + "lwc1 %[in5], 12*4(%[in]) \t\n" + "li.s %[c2], 0.93969262078590838405 \t\n" + "add.s %[t2], %[in1], %[in2] \t\n" + "add.s %[t0], %[in1], %[in3] \t\n" + "li.s %[c3], -0.76604444311897803520 \t\n" + "madd.s %[t3], %[in4], %[in5], %[c1] \t\n" + "sub.s %[t1], %[in4], %[in5] \t\n" + "sub.s %[t2], %[t2], %[in3] \t\n" + "mul.s %[t0], %[t0], %[c2] \t\n" + "li.s %[c4], -0.17364817766693034885 \t\n" + 
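/* The li.s immediates are cosine twiddle factors (cos(k*pi/9),
+         * cos(k*pi/18)); as noted in the comment above, the values the
+         * generic C imdct36() keeps in macros and tables are loaded
+         * directly into FPU registers here. */
+        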
"li.s %[c5], -0.86602540378443864676 \t\n" + "li.s %[c6], 0.98480775301220805936 \t\n" + "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n" + "add.s %[out2], %[t1], %[t2] \t\n" + "add.s %[t2], %[in2], %[in3] \t\n" + "sub.s %[t1], %[in1], %[in2] \t\n" + "sub.s %[out3], %[t3], %[t0] \t\n" + "swc1 %[out1], 6*4(%[tmp]) \t\n" + "swc1 %[out2], 16*4(%[tmp]) \t\n" + "mul.s %[t2], %[t2], %[c3] \t\n" + "mul.s %[t1], %[t1], %[c4] \t\n" + "add.s %[out1], %[t3], %[t0] \t\n" + "lwc1 %[in1], 10*4(%[in]) \t\n" + "lwc1 %[in2], 14*4(%[in]) \t\n" + "sub.s %[out3], %[out3], %[t2] \t\n" + "add.s %[out2], %[t3], %[t2] \t\n" + "add.s %[out1], %[out1], %[t1] \t\n" + "lwc1 %[in3], 2*4(%[in]) \t\n" + "lwc1 %[in4], 6*4(%[in]) \t\n" + "swc1 %[out3], 10*4(%[tmp]) \t\n" + "sub.s %[out2], %[out2], %[t1] \t\n" + "swc1 %[out1], 2*4(%[tmp]) \t\n" + "add.s %[out1], %[in1], %[in2] \t\n" + "add.s %[t2], %[in1], %[in3] \t\n" + "sub.s %[t3], %[in1], %[in2] \t\n" + "swc1 %[out2], 14*4(%[tmp]) \t\n" + "li.s %[c7], -0.34202014332566873304 \t\n" + "sub.s %[out1], %[out1], %[in3] \t\n" + "mul.s %[t2], %[t2], %[c6] \t\n" + "mul.s %[t3], %[t3], %[c7] \t\n" + "li.s %[c8], 0.86602540378443864676 \t\n" + "mul.s %[t0], %[in4], %[c8] \t\n" + "mul.s %[out1], %[out1], %[c5] \t\n" + "add.s %[t1], %[in2], %[in3] \t\n" + "li.s %[c9], -0.64278760968653932632 \t\n" + "add.s %[out2], %[t2], %[t3] \t\n" + "lwc1 %[in1], 9*4(%[in]) \t\n" + "swc1 %[out1], 4*4(%[tmp]) \t\n" + "mul.s %[t1], %[t1], %[c9] \t\n" + "lwc1 %[in2], 17*4(%[in]) \t\n" + "add.s %[out2], %[out2], %[t0] \t\n" + "lwc1 %[in3], 5*4(%[in]) \t\n" + "lwc1 %[in4], 1*4(%[in]) \t\n" + "add.s %[out3], %[t2], %[t1] \t\n" + "sub.s %[out1], %[t3], %[t1] \t\n" + "swc1 %[out2], 0(%[tmp]) \t\n" + "lwc1 %[in5], 13*4(%[in]) \t\n" + "add.s %[t2], %[in1], %[in2] \t\n" + "sub.s %[out3], %[out3], %[t0] \t\n" + "sub.s %[out1], %[out1], %[t0] \t\n" + "add.s %[t0], %[in1], %[in3] \t\n" + "madd.s %[t3], %[in4], %[in5], %[c1] \t\n" + "sub.s %[t2], %[t2], %[in3] \t\n" + "swc1 %[out3], 12*4(%[tmp]) \t\n" + "swc1 %[out1], 8*4(%[tmp]) \t\n" + "sub.s %[t1], %[in4], %[in5] \t\n" + "mul.s %[t0], %[t0], %[c2] \t\n" + "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n" + "add.s %[out2], %[t1], %[t2] \t\n" + "add.s %[t2], %[in2], %[in3] \t\n" + "sub.s %[t1], %[in1], %[in2] \t\n" + "sub.s %[out3], %[t3], %[t0] \t\n" + "swc1 %[out1], 7*4(%[tmp]) \t\n" + "swc1 %[out2], 17*4(%[tmp]) \t\n" + "mul.s %[t2], %[t2], %[c3] \t\n" + "mul.s %[t1], %[t1], %[c4] \t\n" + "add.s %[out1], %[t3], %[t0] \t\n" + "lwc1 %[in1], 11*4(%[in]) \t\n" + "lwc1 %[in2], 15*4(%[in]) \t\n" + "sub.s %[out3], %[out3], %[t2] \t\n" + "add.s %[out2], %[t3], %[t2] \t\n" + "add.s %[out1], %[out1], %[t1] \t\n" + "lwc1 %[in3], 3*4(%[in]) \t\n" + "lwc1 %[in4], 7*4(%[in]) \t\n" + "swc1 %[out3], 11*4(%[tmp]) \t\n" + "sub.s %[out2], %[out2], %[t1] \t\n" + "swc1 %[out1], 3*4(%[tmp]) \t\n" + "add.s %[out3], %[in1], %[in2] \t\n" + "add.s %[t2], %[in1], %[in3] \t\n" + "sub.s %[t3], %[in1], %[in2] \t\n" + "swc1 %[out2], 15*4(%[tmp]) \t\n" + "mul.s %[t0], %[in4], %[c8] \t\n" + "sub.s %[out3], %[out3], %[in3] \t\n" + "mul.s %[t2], %[t2], %[c6] \t\n" + "mul.s %[t3], %[t3], %[c7] \t\n" + "add.s %[t1], %[in2], %[in3] \t\n" + "mul.s %[out3], %[out3], %[c5] \t\n" + "add.s %[out1], %[t2], %[t3] \t\n" + "mul.s %[t1], %[t1], %[c9] \t\n" + "swc1 %[out3], 5*4(%[tmp]) \t\n" + "add.s %[out1], %[out1], %[t0] \t\n" + "add.s %[out2], %[t2], %[t1] \t\n" + "sub.s %[out3], %[t3], %[t1] \t\n" + "swc1 %[out1], 1*4(%[tmp]) \t\n" + "sub.s %[out2], %[out2], %[t0] \t\n" + "sub.s %[out3], %[out3], %[t0] \t\n" 
+ "swc1 %[out2], 13*4(%[tmp]) \t\n" + "swc1 %[out3], 9*4(%[tmp]) \t\n" + + : [t0] "=&f" (t0), [t1] "=&f" (t1), + [t2] "=&f" (t2), [t3] "=&f" (t3), + [in1] "=&f" (in1), [in2] "=&f" (in2), + [in3] "=&f" (in3), [in4] "=&f" (in4), + [in5] "=&f" (in5), + [out1] "=&f" (out1), [out2] "=&f" (out2), + [out3] "=&f" (out3), + [c1] "=&f" (c1), [c2] "=&f" (c2), + [c3] "=&f" (c3), [c4] "=&f" (c4), + [c5] "=&f" (c5), [c6] "=&f" (c6), + [c7] "=&f" (c7), [c8] "=&f" (c8), + [c9] "=&f" (c9) + : [in] "r" (in), [tmp] "r" (tmp) + : "memory" + ); + + /* loop 4 */ + __asm__ volatile ( + "lwc1 %[in1], 2*4(%[tmp]) \t\n" + "lwc1 %[in2], 0(%[tmp]) \t\n" + "lwc1 %[in3], 3*4(%[tmp]) \t\n" + "lwc1 %[in4], 1*4(%[tmp]) \t\n" + "li.s %[c1], 0.50190991877167369479 \t\n" + "li.s %[c2], 5.73685662283492756461 \t\n" + "add.s %[s0], %[in1], %[in2] \t\n" + "sub.s %[s2], %[in1], %[in2] \t\n" + "add.s %[s1], %[in3], %[in4] \t\n" + "sub.s %[s3], %[in3], %[in4] \t\n" + "lwc1 %[in1], 9*4(%[win]) \t\n" + "lwc1 %[in2], 4*9*4(%[buf]) \t\n" + "lwc1 %[in3], 8*4(%[win]) \t\n" + "mul.s %[s1], %[s1], %[c1] \t\n" + "mul.s %[s3], %[s3], %[c2] \t\n" + "lwc1 %[in4], 4*8*4(%[buf]) \t\n" + "lwc1 %[in5], 29*4(%[win]) \t\n" + "lwc1 %[in6], 28*4(%[win]) \t\n" + "add.s %[t0], %[s0], %[s1] \t\n" + "sub.s %[t1], %[s0], %[s1] \t\n" + "li.s %[c1], 0.51763809020504152469 \t\n" + "li.s %[c2], 1.93185165257813657349 \t\n" + "mul.s %[out3], %[in5], %[t0] \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out4], %[in6], %[t0] \t\n" + "add.s %[t0], %[s2], %[s3] \t\n" + "swc1 %[out3], 4*9*4(%[buf]) \t\n" + "swc1 %[out1], 288*4(%[out]) \t\n" + "swc1 %[out2], 256*4(%[out]) \t\n" + "swc1 %[out4], 4*8*4(%[buf]) \t\n" + "sub.s %[t1], %[s2], %[s3] \t\n" + "lwc1 %[in1], 17*4(%[win]) \t\n" + "lwc1 %[in2], 4*17*4(%[buf]) \t\n" + "lwc1 %[in3], 0(%[win]) \t\n" + "lwc1 %[in4], 0(%[buf]) \t\n" + "lwc1 %[in5], 37*4(%[win]) \t\n" + "lwc1 %[in6], 20*4(%[win]) \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "lwc1 %[in1], 6*4(%[tmp]) \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "swc1 %[out1], 544*4(%[out]) \t\n" + "lwc1 %[in2], 4*4(%[tmp]) \t\n" + "swc1 %[out2], 0(%[out]) \t\n" + "swc1 %[out3], 4*17*4(%[buf]) \t\n" + "swc1 %[out4], 0(%[buf]) \t\n" + "lwc1 %[in3], 7*4(%[tmp]) \t\n" + "add.s %[s0], %[in1], %[in2] \t\n" + "sub.s %[s2], %[in1], %[in2] \t\n" + "lwc1 %[in4], 5*4(%[tmp]) \t\n" + "add.s %[s1], %[in3], %[in4] \t\n" + "sub.s %[s3], %[in3], %[in4] \t\n" + "lwc1 %[in1], 10*4(%[win]) \t\n" + "lwc1 %[in2], 4*10*4(%[buf]) \t\n" + "lwc1 %[in3], 7*4(%[win]) \t\n" + "mul.s %[s1], %[s1], %[c1] \t\n" + "mul.s %[s3], %[s3], %[c2] \t\n" + "add.s %[t0], %[s0], %[s1] \t\n" + "sub.s %[t1], %[s0], %[s1] \t\n" + "lwc1 %[in4], 4*7*4(%[buf]) \t\n" + "lwc1 %[in5], 30*4(%[win]) \t\n" + "lwc1 %[in6], 27*4(%[win]) \t\n" + "li.s %[c1], 0.55168895948124587824 \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "add.s %[t0], %[s2], %[s3] \t\n" + "swc1 %[out1], 320*4(%[out]) \t\n" + "swc1 %[out2], 224*4(%[out]) \t\n" + "swc1 %[out3], 4*10*4(%[buf]) \t\n" + "swc1 %[out4], 4*7*4(%[buf]) \t\n" + "sub.s %[t1], %[s2], %[s3] \t\n" + "lwc1 %[in1], 16*4(%[win]) \t\n" + "lwc1 %[in2], 4*16*4(%[buf]) \t\n" + "lwc1 %[in3], 1*4(%[win]) \t\n" + "lwc1 %[in4], 4*1*4(%[buf]) \t\n" + "lwc1 %[in5], 36*4(%[win]) \t\n" + "lwc1 %[in6], 
21*4(%[win]) \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "lwc1 %[in1], 10*4(%[tmp]) \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[in5], %[t0] \t\n" + "mul.s %[out4], %[in6], %[t0] \t\n" + "swc1 %[out1], 512*4(%[out]) \t\n" + "lwc1 %[in2], 8*4(%[tmp]) \t\n" + "swc1 %[out2], 32*4(%[out]) \t\n" + "swc1 %[out3], 4*16*4(%[buf]) \t\n" + "swc1 %[out4], 4*1*4(%[buf]) \t\n" + "li.s %[c2], 1.18310079157624925896 \t\n" + "add.s %[s0], %[in1], %[in2] \t\n" + "sub.s %[s2], %[in1], %[in2] \t\n" + "lwc1 %[in3], 11*4(%[tmp]) \t\n" + "lwc1 %[in4], 9*4(%[tmp]) \t\n" + "add.s %[s1], %[in3], %[in4] \t\n" + "sub.s %[s3], %[in3], %[in4] \t\n" + "lwc1 %[in1], 11*4(%[win]) \t\n" + "lwc1 %[in2], 4*11*4(%[buf]) \t\n" + "lwc1 %[in3], 6*4(%[win]) \t\n" + "mul.s %[s1], %[s1], %[c1] \t\n" + "mul.s %[s3], %[s3], %[c2] \t\n" + "lwc1 %[in4], 4*6*4(%[buf]) \t\n" + "lwc1 %[in5], 31*4(%[win]) \t\n" + "lwc1 %[in6], 26*4(%[win]) \t\n" + "add.s %[t0], %[s0], %[s1] \t\n" + "sub.s %[t1], %[s0], %[s1] \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "add.s %[t0], %[s2], %[s3] \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "swc1 %[out3], 4*11*4(%[buf]) \t\n" + "swc1 %[out4], 4*6*4(%[buf]) \t\n" + "sub.s %[t1], %[s2], %[s3] \t\n" + "swc1 %[out1], 352*4(%[out]) \t\n" + "swc1 %[out2], 192*4(%[out]) \t\n" + "lwc1 %[in1], 15*4(%[win]) \t\n" + "lwc1 %[in2], 4*15*4(%[buf]) \t\n" + "lwc1 %[in3], 2*4(%[win]) \t\n" + "lwc1 %[in4], 4*2*4(%[buf]) \t\n" + "lwc1 %[in5], 35*4(%[win]) \t\n" + "lwc1 %[in6], 22*4(%[win]) \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "lwc1 %[in1], 14*4(%[tmp]) \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "swc1 %[out1], 480*4(%[out]) \t\n" + "lwc1 %[in2], 12*4(%[tmp]) \t\n" + "swc1 %[out2], 64*4(%[out]) \t\n" + "swc1 %[out3], 4*15*4(%[buf]) \t\n" + "swc1 %[out4], 4*2*4(%[buf]) \t\n" + "lwc1 %[in3], 15*4(%[tmp]) \t\n" + "add.s %[s0], %[in1], %[in2] \t\n" + "sub.s %[s2], %[in1], %[in2] \t\n" + "lwc1 %[in4], 13*4(%[tmp]) \t\n" + "li.s %[c1], 0.61038729438072803416 \t\n" + "li.s %[c2], 0.87172339781054900991 \t\n" + "add.s %[s1], %[in3], %[in4] \t\n" + "sub.s %[s3], %[in3], %[in4] \t\n" + "lwc1 %[in1], 12*4(%[win]) \t\n" + "lwc1 %[in2], 4*12*4(%[buf]) \t\n" + "lwc1 %[in3], 5*4(%[win]) \t\n" + "mul.s %[s1], %[s1], %[c1] \t\n" + "mul.s %[s3], %[s3], %[c2] \t\n" + "lwc1 %[in4], 4*5*4(%[buf]) \t\n" + "lwc1 %[in5], 32*4(%[win]) \t\n" + "lwc1 %[in6], 25*4(%[win]) \t\n" + "add.s %[t0], %[s0], %[s1] \t\n" + "sub.s %[t1], %[s0], %[s1] \t\n" + "lwc1 %[s0], 16*4(%[tmp]) \t\n" + "lwc1 %[s1], 17*4(%[tmp]) \t\n" + "li.s %[c1], 0.70710678118654752439 \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "add.s %[t0], %[s2], %[s3] \t\n" + "swc1 %[out3], 4*12*4(%[buf]) \t\n" + "swc1 %[out1], 384*4(%[out]) \t\n" + "swc1 %[out2], 160*4(%[out]) \t\n" + "swc1 %[out4], 4*5*4(%[buf]) \t\n" + "sub.s %[t1], %[s2], %[s3] \t\n" + "lwc1 %[in1], 14*4(%[win]) \t\n" + "lwc1 %[in2], 4*14*4(%[buf]) \t\n" + "lwc1 %[in3], 3*4(%[win]) \t\n" + "lwc1 %[in4], 4*3*4(%[buf]) \t\n" + "lwc1 %[in5], 34*4(%[win]) \t\n" + "lwc1 %[in6], 23*4(%[win]) \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "mul.s %[s1], %[s1], %[c1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[in5], %[t0] \t\n" + "mul.s 
%[out4], %[in6], %[t0] \t\n" + "swc1 %[out1], 448*4(%[out]) \t\n" + "add.s %[t0], %[s0], %[s1] \t\n" + "swc1 %[out2], 96*4(%[out]) \t\n" + "swc1 %[out3], 4*14*4(%[buf]) \t\n" + "swc1 %[out4], 4*3*4(%[buf]) \t\n" + "sub.s %[t1], %[s0], %[s1] \t\n" + "lwc1 %[in1], 13*4(%[win]) \t\n" + "lwc1 %[in2], 4*13*4(%[buf]) \t\n" + "lwc1 %[in3], 4*4(%[win]) \t\n" + "lwc1 %[in4], 4*4*4(%[buf]) \t\n" + "lwc1 %[in5], 33*4(%[win]) \t\n" + "lwc1 %[in6], 24*4(%[win]) \t\n" + "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" + "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" + "mul.s %[out3], %[t0], %[in5] \t\n" + "mul.s %[out4], %[t0], %[in6] \t\n" + "swc1 %[out1], 416*4(%[out]) \t\n" + "swc1 %[out2], 128*4(%[out]) \t\n" + "swc1 %[out3], 4*13*4(%[buf]) \t\n" + "swc1 %[out4], 4*4*4(%[buf]) \t\n" + + : [c1] "=&f" (c1), [c2] "=&f" (c2), + [in1] "=&f" (in1), [in2] "=&f" (in2), + [in3] "=&f" (in3), [in4] "=&f" (in4), + [in5] "=&f" (in5), [in6] "=&f" (in6), + [out1] "=&f" (out1), [out2] "=&f" (out2), + [out3] "=&f" (out3), [out4] "=&f" (out4), + [t0] "=&f" (t0), [t1] "=&f" (t1), + [t2] "=&f" (t2), [t3] "=&f" (t3), + [s0] "=&f" (s0), [s1] "=&f" (s1), + [s2] "=&f" (s2), [s3] "=&f" (s3) + : [tmp] "r" (tmp), [win] "r" (win), + [buf] "r" (buf), [out] "r" (out) + : "memory" + ); +} + +static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in, + int count, int switch_point, int block_type) +{ + int j; + for (j=0 ; j < count; j++) { + /* apply window & overlap with previous buffer */ + + /* select window */ + int win_idx = (switch_point && j < 2) ? 0 : block_type; + float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; + + imdct36_mips_float(out, buf, in, win); + + in += 18; + buf += ((j&3) != 3 ? 1 : (72-3)); + out++; + } +} + +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */ + +void ff_mpadsp_init_mipsfpu(MPADSPContext *s) +{ +#if HAVE_INLINE_ASM && HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + s->apply_window_float = ff_mpadsp_apply_window_mips_float; + s->imdct36_blocks_float = ff_imdct36_blocks_mips_float; + s->dct32_float = ff_dct32_mips_float; +#endif +#endif +} diff --git a/libavcodec/mips/mpegvideo_init_mips.c b/libavcodec/mips/mpegvideo_init_mips.c new file mode 100644 index 0000000000..1918da5f46 --- /dev/null +++ b/libavcodec/mips/mpegvideo_init_mips.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h263dsp_mips.h" +#include "mpegvideo_mips.h" + +#if HAVE_MSA +static av_cold void dct_unquantize_init_msa(MpegEncContext *s) +{ + s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_msa; + s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_msa; + if (!s->q_scale_type) + s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa; +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void dct_unquantize_init_mmi(MpegEncContext *s) +{ + s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_mmi; + s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_mmi; + s->dct_unquantize_mpeg1_intra = ff_dct_unquantize_mpeg1_intra_mmi; + s->dct_unquantize_mpeg1_inter = ff_dct_unquantize_mpeg1_inter_mmi; + + if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT)) + if (!s->q_scale_type) + s->dct_unquantize_mpeg2_intra = ff_dct_unquantize_mpeg2_intra_mmi; + + s->denoise_dct= ff_denoise_dct_mmi; +} +#endif /* HAVE_MMI */ + +av_cold void ff_mpv_common_init_mips(MpegEncContext *s) +{ +#if HAVE_MSA + dct_unquantize_init_msa(s); +#endif // #if HAVE_MSA +#if HAVE_MMI + dct_unquantize_init_mmi(s); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h new file mode 100644 index 0000000000..760d7b3295 --- /dev/null +++ b/libavcodec/mips/mpegvideo_mips.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_MPEGVIDEO_MIPS_H +#define AVCODEC_MIPS_MPEGVIDEO_MIPS_H + +#include "libavcodec/mpegvideo.h" + +void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block); + +#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */ diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c new file mode 100644 index 0000000000..450a18c288 --- /dev/null +++ b/libavcodec/mips/mpegvideo_mmi.c @@ -0,0 +1,492 @@ +/* + * Loongson SIMD optimized mpegvideo + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "mpegvideo_mips.h" +#include "libavutil/mips/asmdefs.h" + +void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale) +{ + int64_t level, qmul, qadd, nCoeffs; + double ftmp[6]; + mips_reg addr[1]; + + qmul = qscale << 1; + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); + + if (!s->h263_aic) { + if (n<4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale-1) | 1; + } else { + qadd = 0; + level = block[0]; + } + + if(s->ac_pred) + nCoeffs = 63; + else + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "packsswh %[qmul], %[qmul], %[qmul] \n\t" + "packsswh %[qmul], %[qmul], %[qmul] \n\t" + "packsswh %[qadd], %[qadd], %[qadd] \n\t" + "packsswh %[qadd], %[qadd], %[qadd] \n\t" + "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[addr0]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t" + "mov.d %[ftmp3], %[ftmp1] \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t" + "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t" + "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[addr0]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[addr0]) \n\t" + "blez %[nCoeffs], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [addr0]"=&r"(addr[0]) + : [block]"r"((mips_reg)(block+nCoeffs)), + [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), + [qmul]"f"(qmul), [qadd]"f"(qadd) + : "memory" + ); + + block[0] = level; +} + +void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale) +{ + int64_t qmul, qadd, nCoeffs; + double ftmp[6]; + mips_reg addr[1]; + + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]]; + + __asm__ volatile ( + "packsswh %[qmul], %[qmul], %[qmul] \n\t" + "packsswh %[qmul], %[qmul], %[qmul] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "packsswh %[qadd], %[qadd], %[qadd] \n\t" + "packsswh %[qadd], %[qadd], %[qadd] \n\t" + "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[addr0]) 
\n\t" + "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t" + "mov.d %[ftmp3], %[ftmp1] \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t" + "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t" + "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t" + "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[addr0]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[addr0]) \n\t" + "blez %[nCoeffs], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [addr0]"=&r"(addr[0]) + : [block]"r"((mips_reg)(block+nCoeffs)), + [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), + [qmul]"f"(qmul), [qadd]"f"(qadd) + : "memory" + ); +} + +void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale) +{ + int64_t nCoeffs; + const uint16_t *quant_matrix; + int block0; + double ftmp[10]; + uint64_t tmp[1]; + mips_reg addr[1]; + + av_assert2(s->block_last_index[n]>=0); + nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1; + + if (n<4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; + + __asm__ volatile ( + "dli %[tmp0], 0x0f \n\t" + "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp4] \n\t" + "dmtc1 %[qscale], %[ftmp1] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "or %[addr0], %[nCoeffs], $0 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "gsldxc1 %[ftmp2], 0x00(%[addr0], %[block]) \n\t" + "gsldxc1 %[ftmp3], 0x08(%[addr0], %[block]) \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "mov.d %[ftmp5], %[ftmp3] \n\t" + "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t" + "gsldxc1 %[ftmp7], 0x08(%[addr0], %[quant]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "dli %[tmp0], 0x03 \n\t" + "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "dmtc1 %[tmp0], %[ftmp4] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp2], 
%[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "gssdxc1 %[ftmp6], 0x00(%[addr0], %[block]) \n\t" + "gssdxc1 %[ftmp7], 0x08(%[addr0], %[block]) \n\t" + PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t" + "bltz %[addr0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]) + : [block]"r"((mips_reg)(block+nCoeffs)), + [quant]"r"((mips_reg)(quant_matrix+nCoeffs)), + [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), + [qscale]"r"(qscale) + : "memory" + ); + + block[0] = block0; +} + +void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale) +{ + int64_t nCoeffs; + const uint16_t *quant_matrix; + double ftmp[10]; + uint64_t tmp[1]; + mips_reg addr[1]; + + av_assert2(s->block_last_index[n] >= 0); + nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1; + quant_matrix = s->inter_matrix; + + __asm__ volatile ( + "dli %[tmp0], 0x0f \n\t" + "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp4] \n\t" + "dmtc1 %[qscale], %[ftmp1] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "or %[addr0], %[nCoeffs], $0 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "gsldxc1 %[ftmp2], 0x00(%[addr0], %[block]) \n\t" + "gsldxc1 %[ftmp3], 0x08(%[addr0], %[block]) \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "mov.d %[ftmp5], %[ftmp3] \n\t" + "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t" + "gsldxc1 %[ftmp7], 0x08(%[addr0], %[quant]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "dli %[tmp0], 0x04 \n\t" + "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t" + "dmtc1 %[tmp0], %[ftmp4] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "gssdxc1 %[ftmp6], 0x00(%[addr0], %[block]) \n\t" + "gssdxc1 %[ftmp7], 0x08(%[addr0], %[block]) \n\t" + PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t" + "bltz %[addr0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), 
[ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]) + : [block]"r"((mips_reg)(block+nCoeffs)), + [quant]"r"((mips_reg)(quant_matrix+nCoeffs)), + [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), + [qscale]"r"(qscale) + : "memory" + ); +} + +void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block, + int n, int qscale) +{ + uint64_t nCoeffs; + const uint16_t *quant_matrix; + int block0; + double ftmp[10]; + uint64_t tmp[1]; + mips_reg addr[1]; + + assert(s->block_last_index[n]>=0); + + if (s->alternate_scan) + nCoeffs = 63; + else + nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]]; + + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + + quant_matrix = s->intra_matrix; + + __asm__ volatile ( + "dli %[tmp0], 0x0f \n\t" + "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "mtc1 %[qscale], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "or %[addr0], %[nCoeffs], $0 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "gsldxc1 %[ftmp1], 0x00(%[addr0], %[block]) \n\t" + "gsldxc1 %[ftmp2], 0x08(%[addr0], %[block]) \n\t" + "mov.d %[ftmp3], %[ftmp1] \n\t" + "mov.d %[ftmp4], %[ftmp2] \n\t" + "gsldxc1 %[ftmp5], 0x00(%[addr0], %[quant]) \n\t" + "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pcmpgth %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "dli %[tmp0], 0x03 \n\t" + "pcmpeqh %[ftmp6] , %[ftmp6], %[ftmp4] \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t" + "pandn %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t" + "gssdxc1 %[ftmp5], 0x00(%[addr0], %[block]) \n\t" + "gssdxc1 %[ftmp6], 0x08(%[addr0], %[block]) \n\t" + "blez %[addr0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]) + : [block]"r"((mips_reg)(block+nCoeffs)), + [quant]"r"((mips_reg)(quant_matrix+nCoeffs)), + [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), + [qscale]"r"(qscale) + : "memory" + ); + + block[0]= block0; +} + +void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block) +{ + const int intra = s->mb_intra; + int *sum = s->dct_error_sum[intra]; + uint16_t *offset = s->dct_offset[intra]; + double ftmp[8]; + mips_reg addr[1]; + + 
s->dct_count[intra]++; + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + "ldc1 %[ftmp1], 0x00(%[block]) \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "ldc1 %[ftmp3], 0x08(%[block]) \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "pcmpgth %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpgth %[ftmp4], %[ftmp4], %[ftmp3] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "ldc1 %[ftmp6], 0x00(%[offset]) \n\t" + "mov.d %[ftmp5], %[ftmp1] \n\t" + "psubush %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + "ldc1 %[ftmp6], 0x08(%[offset]) \n\t" + "mov.d %[ftmp7], %[ftmp3] \n\t" + "psubush %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "sdc1 %[ftmp1], 0x00(%[block]) \n\t" + "sdc1 %[ftmp3], 0x08(%[block]) \n\t" + "mov.d %[ftmp1], %[ftmp5] \n\t" + "mov.d %[ftmp3], %[ftmp7] \n\t" + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpckhhw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "ldc1 %[ftmp2], 0x00(%[sum]) \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x08(%[sum]) \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x10(%[sum]) \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp2] \n\t" + "ldc1 %[ftmp2], 0x18(%[sum]) \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp2] \n\t" + "sdc1 %[ftmp5], 0x00(%[sum]) \n\t" + "sdc1 %[ftmp1], 0x08(%[sum]) \n\t" + "sdc1 %[ftmp7], 0x10(%[sum]) \n\t" + "sdc1 %[ftmp3], 0x18(%[sum]) \n\t" + PTR_ADDIU "%[block], %[block], 0x10 \n\t" + PTR_ADDIU "%[sum], %[sum], 0x20 \n\t" + PTR_SUBU "%[addr0], %[block1], %[block] \n\t" + PTR_ADDIU "%[offset], %[offset], 0x10 \n\t" + "bgtz %[addr0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [addr0]"=&r"(addr[0]), + [block]"+&r"(block), [sum]"+&r"(sum), + [offset]"+&r"(offset) + : [block1]"r"(block+64) + : "memory" + ); +} diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c new file mode 100644 index 0000000000..aa9ef770eb --- /dev/null +++ b/libavcodec/mips/mpegvideo_msa.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h263dsp_mips.h" + +static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul, + int16_t qadd, int8_t n_coeffs, + uint8_t loop_start) +{ + int16_t *block_dup = block; + int32_t level, cnt; + v8i16 block_vec, qmul_vec, qadd_vec, sub; + v8i16 add, mask, mul, zero_mask; + + qmul_vec = __msa_fill_h(qmul); + qadd_vec = __msa_fill_h(qadd); + for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) { + block_vec = LD_SH(block_dup + loop_start); + mask = __msa_clti_s_h(block_vec, 0); + zero_mask = __msa_ceqi_h(block_vec, 0); + mul = block_vec * qmul_vec; + sub = mul - qadd_vec; + add = mul + qadd_vec; + add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask); + block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec, + (v16u8) zero_mask); + ST_SH(block_vec, block_dup + loop_start); + block_dup += 8; + } + + cnt = ((n_coeffs >> 3) * 8) + loop_start; + + for (; cnt <= n_coeffs; cnt++) { + level = block[cnt]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[cnt] = level; + } + } +} + +static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block, + int32_t qscale, + const int16_t *quant_matrix) +{ + int32_t cnt, sum_res = -1; + v8i16 block_vec, block_neg, qscale_vec, mask; + v8i16 block_org0, block_org1, block_org2, block_org3; + v8i16 quant_m0, quant_m1, quant_m2, quant_m3; + v8i16 sum, mul, zero_mask; + v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l; + v4i32 block_l, block_r, sad; + + qscale_vec = __msa_fill_h(qscale); + for (cnt = 0; cnt < 2; cnt++) { + LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3); + LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3); + mask = __msa_clti_s_h(block_org0, 0); + zero_mask = __msa_ceqi_h(block_org0, 0); + block_neg = -block_org0; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0, + (v16u8) zero_mask); + ST_SH(sum, block); + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org1, 0); + zero_mask = __msa_ceqi_h(block_org1, 0); + block_neg = - block_org1; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, 
(v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org2, 0); + zero_mask = __msa_ceqi_h(block_org2, 0); + block_neg = - block_org2; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org3, 0); + zero_mask = __msa_ceqi_h(block_org3, 0); + block_neg = - block_org3; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + } + + return sum_res; +} + +void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + int32_t qmul, qadd; + int32_t nCoeffs; + + av_assert2(s->block_last_index[index] >= 0 || s->h263_aic); + + qmul = qscale << 1; + + if (!s->h263_aic) { + block[0] *= index < 4 ? 
s->y_dc_scale : s->c_dc_scale; + qadd = (qscale - 1) | 1; + } else { + qadd = 0; + } + if (s->ac_pred) + nCoeffs = 63; + else + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]]; + + h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1); +} + +void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + int32_t qmul, qadd; + int32_t nCoeffs; + + av_assert2(s->block_last_index[index] >= 0); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]]; + + h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0); +} + +void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + const uint16_t *quant_matrix; + int32_t sum = -1; + + quant_matrix = s->inter_matrix; + + sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix); + + block[63] ^= sum & 1; +} diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c new file mode 100644 index 0000000000..9bfe94e4cd --- /dev/null +++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/bit_depth_template.c" +#include "h263dsp_mips.h" + +#if HAVE_MSA +static av_cold void mpegvideoencdsp_init_msa(MpegvideoEncDSPContext *c, + AVCodecContext *avctx) +{ +#if BIT_DEPTH == 8 + c->pix_sum = ff_pix_sum_msa; +#endif +} +#endif // #if HAVE_MSA + +av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c, + AVCodecContext *avctx) +{ +#if HAVE_MSA + mpegvideoencdsp_init_msa(c, avctx); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/mpegvideoencdsp_msa.c b/libavcodec/mips/mpegvideoencdsp_msa.c new file mode 100644 index 0000000000..46473dafe5 --- /dev/null +++ b/libavcodec/mips/mpegvideoencdsp_msa.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h263dsp_mips.h" +#include "libavutil/mips/generic_macros_msa.h" + +static int32_t sum_u8src_16width_msa(uint8_t *src, int32_t stride) +{ + uint32_t sum = 0; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7); + src += (8 * stride); + LD_UB8(src, stride, in8, in9, in10, in11, in12, in13, in14, in15); + + HADD_UB4_UB(in0, in1, in2, in3, in0, in1, in2, in3); + HADD_UB4_UB(in4, in5, in6, in7, in4, in5, in6, in7); + HADD_UB4_UB(in8, in9, in10, in11, in8, in9, in10, in11); + HADD_UB4_UB(in12, in13, in14, in15, in12, in13, in14, in15); + + sum = HADD_UH_U32(in0); + sum += HADD_UH_U32(in1); + sum += HADD_UH_U32(in2); + sum += HADD_UH_U32(in3); + sum += HADD_UH_U32(in4); + sum += HADD_UH_U32(in5); + sum += HADD_UH_U32(in6); + sum += HADD_UH_U32(in7); + sum += HADD_UH_U32(in8); + sum += HADD_UH_U32(in9); + sum += HADD_UH_U32(in10); + sum += HADD_UH_U32(in11); + sum += HADD_UH_U32(in12); + sum += HADD_UH_U32(in13); + sum += HADD_UH_U32(in14); + sum += HADD_UH_U32(in15); + + return sum; +} + +int ff_pix_sum_msa(uint8_t *pix, int line_size) +{ + return sum_u8src_16width_msa(pix, line_size); +} diff --git a/libavcodec/mips/pixblockdsp_init_mips.c b/libavcodec/mips/pixblockdsp_init_mips.c new file mode 100644 index 0000000000..1b3741ea76 --- /dev/null +++ b/libavcodec/mips/pixblockdsp_init_mips.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "pixblockdsp_mips.h" + +#if HAVE_MSA +static av_cold void pixblockdsp_init_msa(PixblockDSPContext *c, + AVCodecContext *avctx, + unsigned high_bit_depth) +{ + c->diff_pixels = ff_diff_pixels_msa; + + switch (avctx->bits_per_raw_sample) { + case 9: + case 10: + case 12: + case 14: + c->get_pixels = ff_get_pixels_16_msa; + break; + default: + if (avctx->bits_per_raw_sample <= 8 || avctx->codec_type != + AVMEDIA_TYPE_VIDEO) { + c->get_pixels = ff_get_pixels_8_msa; + } + break; + } +} +#endif // #if HAVE_MSA + +#if HAVE_MMI +static av_cold void pixblockdsp_init_mmi(PixblockDSPContext *c, + AVCodecContext *avctx, unsigned high_bit_depth) +{ + c->diff_pixels = ff_diff_pixels_mmi; + + if (!high_bit_depth || avctx->codec_type != AVMEDIA_TYPE_VIDEO) { + c->get_pixels = ff_get_pixels_8_mmi; + } +} +#endif /* HAVE_MMI */ + +void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ +#if HAVE_MSA + pixblockdsp_init_msa(c, avctx, high_bit_depth); +#endif // #if HAVE_MSA +#if HAVE_MMI + pixblockdsp_init_mmi(c, avctx, high_bit_depth); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/pixblockdsp_mips.h b/libavcodec/mips/pixblockdsp_mips.h new file mode 100644 index 0000000000..7f8cc96683 --- /dev/null +++ b/libavcodec/mips/pixblockdsp_mips.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H +#define AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H + +#include "../mpegvideo.h" + +void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1, + const uint8_t *src2, int stride); +void ff_get_pixels_16_msa(int16_t *restrict dst, const uint8_t *src, + ptrdiff_t stride); +void ff_get_pixels_8_msa(int16_t *restrict dst, const uint8_t *src, + ptrdiff_t stride); + +void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels, + ptrdiff_t line_size); +void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1, + const uint8_t *src2, int stride); + +#endif // #ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H diff --git a/libavcodec/mips/pixblockdsp_mmi.c b/libavcodec/mips/pixblockdsp_mmi.c new file mode 100644 index 0000000000..3ff84c0f18 --- /dev/null +++ b/libavcodec/mips/pixblockdsp_mmi.c @@ -0,0 +1,98 @@ +/* + * Loongson SIMD optimized pixblockdsp + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "pixblockdsp_mips.h" +#include "libavutil/mips/asmdefs.h" + +void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels, + ptrdiff_t line_size) +{ + double ftmp[6]; + mips_reg tmp[2]; + + __asm__ volatile ( + "li %[tmp1], 0x08 \n\t" + "move %[tmp0], $0 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[pixels]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[pixels]) \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" + "gssdxc1 %[ftmp2], 0x00(%[block], %[tmp0]) \n\t" + "gssdxc1 %[ftmp5], 0x08(%[block], %[tmp0]) \n\t" + PTR_ADDI "%[tmp1], %[tmp1], -0x01 \n\t" + PTR_ADDIU "%[tmp0], %[tmp0], 0x10 \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + "bnez %[tmp1], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [pixels]"+&r"(pixels) + : [block]"r"((mips_reg)block), [line_size]"r"((mips_reg)line_size) + : "memory" + ); +} + +void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1, + const uint8_t *src2, int stride) +{ + double ftmp[5]; + mips_reg tmp[1]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src1]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src1]) \n\t" + "or %[ftmp1], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" + "or %[ftmp3], %[ftmp2], %[ftmp2] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[block]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[block]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[block]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[block]) \n\t" + PTR_ADDI "%[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDIU "%[block], %[block], 0x10 \n\t" + PTR_ADDU "%[src1], %[src1], %[stride] \n\t" + PTR_ADDU "%[src2], %[src2], %[stride] \n\t" + "bgtz %[tmp0], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), + [tmp0]"=&r"(tmp[0]), + [block]"+&r"(block), [src1]"+&r"(src1), + [src2]"+&r"(src2) + : [stride]"r"((mips_reg)stride) + : "memory" + ); +} diff --git a/libavcodec/mips/pixblockdsp_msa.c b/libavcodec/mips/pixblockdsp_msa.c new file mode 100644 index 0000000000..966e11a7f5 --- /dev/null +++ b/libavcodec/mips/pixblockdsp_msa.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of 
FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "pixblockdsp_mips.h" + +static void diff_pixels_msa(int16_t *block, const uint8_t *src1, + const uint8_t *src2, int32_t stride) +{ + v16u8 in10, in11, in12, in13, in14, in15, in16, in17; + v16u8 in20, in21, in22, in23, in24, in25, in26, in27; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + + LD_UB8(src1, stride, in10, in11, in12, in13, in14, in15, in16, in17); + LD_UB8(src2, stride, in20, in21, in22, in23, in24, in25, in26, in27); + ILVR_B4_SH(in10, in20, in11, in21, in12, in22, in13, in23, + out0, out1, out2, out3); + ILVR_B4_SH(in14, in24, in15, in25, in16, in26, in17, in27, + out4, out5, out6, out7); + HSUB_UB4_SH(out0, out1, out2, out3, out0, out1, out2, out3); + HSUB_UB4_SH(out4, out5, out6, out7, out4, out5, out6, out7); + ST_SH8(out0, out1, out2, out3, out4, out5, out6, out7, block, 8); +} + +static void copy_8bit_to_16bit_width8_msa(const uint8_t *src, int32_t src_stride, + int16_t *dst, int32_t dst_stride, + int32_t height) +{ + uint8_t *dst_ptr; + int32_t cnt; + v16u8 src0, src1, src2, src3; + v16i8 zero = { 0 }; + + dst_ptr = (uint8_t *) dst; + + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ILVR_B4_UB(zero, src0, zero, src1, zero, src2, zero, src3, + src0, src1, src2, src3); + + ST_UB4(src0, src1, src2, src3, dst_ptr, (dst_stride * 2)); + dst_ptr += (4 * 2 * dst_stride); + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + 
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src, + ptrdiff_t stride) +{ + copy_width16_msa(src, stride, (uint8_t *) dest, 16, 8); +} + +void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src, + ptrdiff_t stride) +{ + copy_8bit_to_16bit_width8_msa(src, stride, dest, 8, 8); +} + +void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1, + const uint8_t *src2, int stride) +{ + diff_pixels_msa(block, src1, src2, stride); +} diff --git a/libavcodec/mips/qpeldsp_init_mips.c b/libavcodec/mips/qpeldsp_init_mips.c new file mode 100644 index 0000000000..140e8f89c9 --- /dev/null +++ b/libavcodec/mips/qpeldsp_init_mips.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "qpeldsp_mips.h" + +#if HAVE_MSA +static av_cold void qpeldsp_init_msa(QpelDSPContext *c) +{ + c->put_qpel_pixels_tab[0][0] = ff_copy_16x16_msa; + c->put_qpel_pixels_tab[0][1] = ff_horiz_mc_qpel_aver_src0_16width_msa; + c->put_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_16width_msa; + c->put_qpel_pixels_tab[0][3] = ff_horiz_mc_qpel_aver_src1_16width_msa; + c->put_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_aver_src0_16x16_msa; + c->put_qpel_pixels_tab[0][5] = ff_hv_mc_qpel_aver_hv_src00_16x16_msa; + c->put_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_aver_v_src0_16x16_msa; + c->put_qpel_pixels_tab[0][7] = ff_hv_mc_qpel_aver_hv_src10_16x16_msa; + c->put_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_16x16_msa; + c->put_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_aver_h_src0_16x16_msa; + c->put_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_16x16_msa; + c->put_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_aver_h_src1_16x16_msa; + c->put_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_aver_src1_16x16_msa; + c->put_qpel_pixels_tab[0][13] = ff_hv_mc_qpel_aver_hv_src01_16x16_msa; + c->put_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_aver_v_src1_16x16_msa; + c->put_qpel_pixels_tab[0][15] = ff_hv_mc_qpel_aver_hv_src11_16x16_msa; + + c->put_qpel_pixels_tab[1][0] = ff_copy_8x8_msa; + c->put_qpel_pixels_tab[1][1] = ff_horiz_mc_qpel_aver_src0_8width_msa; + c->put_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_8width_msa; + c->put_qpel_pixels_tab[1][3] = ff_horiz_mc_qpel_aver_src1_8width_msa; + c->put_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_aver_src0_8x8_msa; + c->put_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_aver_hv_src00_8x8_msa; + c->put_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_aver_v_src0_8x8_msa; + c->put_qpel_pixels_tab[1][7] = 
ff_hv_mc_qpel_aver_hv_src10_8x8_msa; + c->put_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_8x8_msa; + c->put_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_aver_h_src0_8x8_msa; + c->put_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_8x8_msa; + c->put_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_aver_h_src1_8x8_msa; + c->put_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_aver_src1_8x8_msa; + c->put_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_aver_hv_src01_8x8_msa; + c->put_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_aver_v_src1_8x8_msa; + c->put_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_aver_hv_src11_8x8_msa; + + c->put_no_rnd_qpel_pixels_tab[0][0] = ff_copy_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][1] = + ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa; + c->put_no_rnd_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_no_rnd_16width_msa; + c->put_no_rnd_qpel_pixels_tab[0][3] = + ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa; + c->put_no_rnd_qpel_pixels_tab[0][4] = + ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][5] = + ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][6] = + ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][7] = + ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_no_rnd_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][9] = + ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_no_rnd_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][11] = + ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][12] = + ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][13] = + ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][14] = + ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa; + c->put_no_rnd_qpel_pixels_tab[0][15] = + ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa; + + c->put_no_rnd_qpel_pixels_tab[1][0] = ff_copy_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][1] = + ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa; + c->put_no_rnd_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_no_rnd_8width_msa; + c->put_no_rnd_qpel_pixels_tab[1][3] = + ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa; + c->put_no_rnd_qpel_pixels_tab[1][4] = + ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][5] = + ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][6] = + ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][7] = + ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_no_rnd_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][9] = + ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_no_rnd_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][11] = + ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][12] = + ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][13] = + ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][14] = + ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa; + c->put_no_rnd_qpel_pixels_tab[1][15] = + ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa; + + c->avg_qpel_pixels_tab[0][0] = ff_avg_width16_msa; + c->avg_qpel_pixels_tab[0][1] = + ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa; + c->avg_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_avg_dst_16width_msa; + c->avg_qpel_pixels_tab[0][3] = + ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa; + c->avg_qpel_pixels_tab[0][4] = 
ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa; + c->avg_qpel_pixels_tab[0][5] = + ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa; + c->avg_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa; + c->avg_qpel_pixels_tab[0][7] = + ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa; + c->avg_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_avg_dst_16x16_msa; + c->avg_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa; + c->avg_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_avg_dst_16x16_msa; + c->avg_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa; + c->avg_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa; + c->avg_qpel_pixels_tab[0][13] = + ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa; + c->avg_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa; + c->avg_qpel_pixels_tab[0][15] = + ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa; + + c->avg_qpel_pixels_tab[1][0] = ff_avg_width8_msa; + c->avg_qpel_pixels_tab[1][1] = + ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa; + c->avg_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_avg_dst_8width_msa; + c->avg_qpel_pixels_tab[1][3] = + ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa; + c->avg_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa; + c->avg_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa; + c->avg_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa; + c->avg_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa; + c->avg_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_avg_dst_8x8_msa; + c->avg_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa; + c->avg_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_avg_dst_8x8_msa; + c->avg_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa; + c->avg_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa; + c->avg_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa; + c->avg_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa; + c->avg_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa; +} +#endif // #if HAVE_MSA + +void ff_qpeldsp_init_mips(QpelDSPContext *c) +{ +#if HAVE_MSA + qpeldsp_init_msa(c); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/qpeldsp_mips.h b/libavcodec/mips/qpeldsp_mips.h new file mode 100644 index 0000000000..704d221331 --- /dev/null +++ b/libavcodec/mips/qpeldsp_mips.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_QPELDSP_MIPS_H +#define AVCODEC_MIPS_QPELDSP_MIPS_H + +#include "../mpegvideo.h" + +void ff_copy_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_copy_16x16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_avg_width8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_avg_width16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + +void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_8width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_16width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); + +void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void 
ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); + +void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dst, + const uint8_t 
*src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); +void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dst, + const uint8_t *src, + ptrdiff_t stride); + +#endif // #ifndef AVCODEC_MIPS_QPELDSP_MIPS_H diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c new file mode 100644 index 0000000000..4710b3f732 --- /dev/null +++ b/libavcodec/mips/qpeldsp_msa.c @@ -0,0 +1,6518 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "qpeldsp_mips.h" + +#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \ +( { \ + v16u8 out, tmp0, tmp1; \ + v16u8 data0, data1, data2, data3, data4, data5; \ + v8i16 res_r, res_l; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \ + ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \ + data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \ + data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \ + HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \ + ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \ + data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \ + data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \ + sum0_r *= (v8u16) (coef0); \ + sum0_l *= (v8u16) (coef0); \ + ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \ + data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \ + data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \ + DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ + ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \ + HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ + DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ + res_r = (v8i16) (sum0_r - sum3_r); \ + res_l = (v8i16) (sum0_l - sum3_l); \ + SRARI_H2_SH(res_r, res_l, 5); \ + CLIP_SH2_0_255(res_r, res_l); \ + out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ + \ + out; \ +} ) + +#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \ + mask0, mask1, mask2, mask3, \ + coef0, coef1, coef2) \ +( { \ + v16u8 out; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \ + v8i16 res0_r, res1_r; \ + \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \ + HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \ + DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \ + DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \ + DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \ + res0_r = (v8i16) (sum0_r - sum3_r); \ + res1_r = (v8i16) (sum4_r - sum7_r); \ + SRARI_H2_SH(res0_r, res1_r, 5); \ + CLIP_SH2_0_255(res0_r, res1_r); \ + out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \ + \ + out; \ +} ) + +#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \ + mask0, mask1, mask2, mask3, \ + coef0, coef1, coef2) \ +( { \ + v16u8 out; \ + v8i16 res0_r; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + \ + VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \ + sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \ + sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \ + VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \ + DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ + res0_r = (v8i16) (sum0_r - sum3_r); \ + res0_r = __msa_srari_h(res0_r, 5); \ + res0_r = CLIP_SH_0_255(res0_r); \ + out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ + \ + out; \ +} ) + +#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \ + mask2, mask3, 
coef0, \ + coef1, coef2) \ +( { \ + v16u8 out; \ + v8i16 res0_r; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + \ + VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \ + sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \ + sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \ + VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \ + DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ + res0_r = (v8i16) (sum0_r - sum3_r); \ + res0_r += 15; \ + res0_r >>= 5; \ + res0_r = CLIP_SH_0_255(res0_r); \ + out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ + \ + out; \ +} ) + +#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \ + coef0, coef1, coef2) \ +( { \ + v16u8 out, tmp0, tmp1; \ + v16u8 data0, data1, data2, data3, data4, data5; \ + v8i16 res_r, res_l; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \ + ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \ + data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \ + data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \ + HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \ + ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \ + data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \ + data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \ + sum0_r *= (v8u16) (coef0); \ + sum0_l *= (v8u16) (coef0); \ + ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \ + data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \ + data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \ + DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ + ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \ + HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ + DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ + res_r = (v8i16) (sum0_r - sum3_r); \ + res_l = (v8i16) (sum0_l - sum3_l); \ + res_r += 15; \ + res_l += 15; \ + res_r >>= 5; \ + res_l >>= 5; \ + CLIP_SH2_0_255(res_r, res_l); \ + out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ + \ + out; \ +} ) + +#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \ + mask0, mask1, mask2, mask3, \ + coef0, coef1, coef2) \ +( { \ + v16u8 out; \ + v8i16 res0_r, res1_r; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \ + \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \ + HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \ + DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \ + VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \ + DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \ + DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \ + res0_r = (v8i16) (sum0_r - sum3_r); \ + res1_r = (v8i16) (sum4_r - sum7_r); \ + res0_r += 15; \ + res1_r += 15; \ + res0_r >>= 5; \ + res1_r >>= 5; \ + CLIP_SH2_0_255(res0_r, res1_r); \ + out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \ + \ + out; \ +} ) + +#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \ + inp4, inp5, inp6, inp7, \ + coef0, coef1, coef2) \ +( { \ + v16u8 res; \ + v8i16 res_r, res_l; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \ + ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \ + DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, 
sum0_l); \ + HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ + ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \ + ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \ + DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ + DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ + res_r = (v8i16) (sum0_r - sum3_r); \ + res_l = (v8i16) (sum0_l - sum3_l); \ + SRARI_H2_SH(res_r, res_l, 5); \ + CLIP_SH2_0_255(res_r, res_l); \ + res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ + \ + res; \ +} ) + +#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \ + inp04, inp05, inp06, inp07, \ + inp10, inp11, inp12, inp13, \ + inp14, inp15, inp16, inp17, \ + coef0, coef1, coef2) \ +( { \ + v16u8 res; \ + v8i16 val0, val1; \ + v8u16 sum00, sum01, sum02, sum03; \ + v8u16 sum10, sum11, sum12, sum13; \ + \ + ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \ + sum00, sum10, sum03, sum13); \ + DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \ + HADD_UB2_UH(sum03, sum13, sum03, sum13); \ + ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \ + sum02, sum12, sum01, sum11); \ + DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \ + DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \ + val0 = (v8i16) (sum00 - sum03); \ + val1 = (v8i16) (sum10 - sum13); \ + SRARI_H2_SH(val0, val1, 5); \ + CLIP_SH2_0_255(val0, val1); \ + res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \ + \ + res; \ +} ) + +#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \ + inp4, inp5, inp6, inp7, \ + coef0, coef1, coef2) \ +( { \ + v16u8 res; \ + v8i16 res_r, res_l; \ + v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ + v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ + \ + ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \ + ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \ + DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \ + HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ + ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \ + ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \ + DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ + DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ + res_r = (v8i16) (sum0_r - sum3_r); \ + res_l = (v8i16) (sum0_l - sum3_l); \ + res_r += 15; \ + res_l += 15; \ + res_r >>= 5; \ + res_l >>= 5; \ + CLIP_SH2_0_255(res_r, res_l); \ + res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ + \ + res; \ +} ) + +#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \ + inp04, inp05, inp06, inp07, \ + inp10, inp11, inp12, inp13, \ + inp14, inp15, inp16, inp17, \ + coef0, coef1, coef2) \ +( { \ + v16u8 res; \ + v8i16 val0, val1; \ + v8u16 sum00, sum01, sum02, sum03; \ + v8u16 sum10, sum11, sum12, sum13; \ + \ + ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \ + sum00, sum10, sum03, sum13); \ + DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \ + HADD_UB2_UH(sum03, sum13, sum03, sum13); \ + ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \ + sum02, sum12, sum01, sum11); \ + DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \ + DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \ + val0 = (v8i16) (sum00 - sum03); \ + val1 = (v8i16) (sum10 - sum13); \ + val0 += 15; \ + val1 += 15; \ + val0 >>= 5; \ + val1 >>= 5; \ + CLIP_SH2_0_255(val0, val1); \ + res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \ + \ + res; \ +} ) + +static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) 
+{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp0, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp2, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp4, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp6, res); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void horiz_mc_qpel_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, 
inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v8u16 const20 = (v8u16) __msa_ldi_h(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v8u16 const20 = (v8u16) __msa_ldi_h(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp1); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp5); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp7); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void 
horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + res0 = __msa_ave_u_b(inp0, res0); + res1 = __msa_ave_u_b(inp2, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v8u16 const20 = (v8u16) __msa_ldi_h(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp0, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp2, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp4, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp6, res); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + ST8x4_UB(res0, 
res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + res0 = __msa_ave_u_b(inp0, res0); + res1 = __msa_ave_u_b(inp2, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp1); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp3); + ST_UB(res, dst); + dst += dst_stride; + + res = 
APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp5); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp7); + ST_UB(res, dst); + dst += dst_stride; + } +} + +static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 dst0, dst1, dst2, dst3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res0, res1; + v16u8 dst0, dst1; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 dst0, dst1, dst2, dst3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 
mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res0, res1; + v16u8 dst0, dst1; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + +static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3; + v16u8 dst0, dst1, dst2, dst3; + v16u8 res0, res1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) 
inp3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res0, res1, dst0, dst1; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1); + AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + } +} + + +static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t 
*dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp0); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp1); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp2); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp3); + ST_UB(res0, dst); + dst += dst_stride; + + LD_UB2(src, src_stride, inp8, inp9); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp4); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp5); + ST_UB(res0, dst); + dst += dst_stride; + + LD_UB2(src, src_stride, inp10, inp11); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp6); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp7); + ST_UB(res0, dst); + dst += dst_stride; + + LD_UB2(src, src_stride, inp12, inp13); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp8); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp9); + ST_UB(res0, dst); + dst += dst_stride; + + LD_UB2(src, src_stride, inp14, inp15); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp10); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp11); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp12); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp13); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, 
inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp14); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp15); + ST_UB(res0, dst); +} + +static void vert_mc_qpel_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void vert_mc_qpel_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + inp4 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp9 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp11 = LD_UB(src); + 
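
[Editorial note, not part of the patch: the APPLY_VERT_QPEL_FILTER / APPLY_HORIZ_QPEL_FILTER invocations above, with their const20/const6/const3 vectors, appear to vectorize the standard MPEG-4 quarter-pel 8-tap lowpass from the C reference in libavcodec/qpeldsp.c. A minimal scalar sketch of the per-pixel arithmetic, assuming that reference's coefficients and rounding; helper names are invented for illustration.]

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* One output pixel of the 8-tap lowpass.  Taps that fall outside the
 * block are edge-replicated, which is why the vector code above reuses
 * inp0/inp16 at the borders.  rnd selects the rounded (put/avg) or
 * truncated (no_rnd) variant of the final shift. */
static uint8_t qpel_lowpass_1px(const uint8_t *s, int rnd)
{
    int sum = (s[0] + s[1]) * 20
            - (s[-1] + s[2]) * 6
            + (s[-2] + s[3]) * 3
            - (s[-3] + s[4]);
    return clip_u8((sum + (rnd ? 16 : 15)) >> 5);
}
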
src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp13 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp15 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; +} + +static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, 
inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + inp4 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp1); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp2); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp3); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp4); + ST_UB(res0, dst); + dst += dst_stride; + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp5); + ST_UB(res0, dst); + dst += dst_stride; + + inp9 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp6); + ST_UB(res0, dst); + dst += dst_stride; + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp7); + ST_UB(res0, dst); + dst += dst_stride; + + inp11 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp8); + ST_UB(res0, dst); + dst += dst_stride; + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp9); + ST_UB(res0, dst); + dst += dst_stride; + + inp13 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp10); + ST_UB(res0, dst); + dst += dst_stride; + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp11); + ST_UB(res0, dst); + dst += dst_stride; + + inp15 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp12); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp13); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp14); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = 
APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp15); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + res0 = __msa_aver_u_b(res0, inp16); + ST_UB(res0, dst); +} + +static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + res0 = __msa_ave_u_b(res0, tmp0); + res1 = __msa_ave_u_b(res1, tmp1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); + res0 = __msa_ave_u_b(res0, tmp0); + res1 = __msa_ave_u_b(res1, tmp1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp0); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp1); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp2); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + res0 = 
__msa_ave_u_b(res0, inp3); + ST_UB(res0, dst); + dst += dst_stride; + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp4); + ST_UB(res0, dst); + dst += dst_stride; + + inp9 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp5); + ST_UB(res0, dst); + dst += dst_stride; + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp6); + ST_UB(res0, dst); + dst += dst_stride; + + inp11 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp7); + ST_UB(res0, dst); + dst += dst_stride; + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp8); + ST_UB(res0, dst); + dst += dst_stride; + + inp13 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp9); + ST_UB(res0, dst); + dst += dst_stride; + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp10); + ST_UB(res0, dst); + dst += dst_stride; + + inp15 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp11); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp12); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp13); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp14); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp15); + ST_UB(res0, dst); + dst += dst_stride; +} + +static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, 
inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp9 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp11 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp13 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp15 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, + inp12, 
inp13, inp14, inp15, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + ST_UB(res0, dst); +} + +static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); + res0 = __msa_ave_u_b(res0, tmp0); + res1 = __msa_ave_u_b(res1, tmp1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); + res0 = __msa_ave_u_b(res0, tmp0); + res1 = __msa_ave_u_b(res1, tmp1); + ST8x4_UB(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp1); + ST_UB(res0, dst); + dst += dst_stride; + + inp5 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp2); + ST_UB(res0, dst); + dst += dst_stride; + + inp6 = LD_UB(src); + src += src_stride; + res0 
= APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp3); + ST_UB(res0, dst); + dst += dst_stride; + + inp7 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp4); + ST_UB(res0, dst); + dst += dst_stride; + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp5); + ST_UB(res0, dst); + dst += dst_stride; + + inp9 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp6); + ST_UB(res0, dst); + dst += dst_stride; + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp7); + ST_UB(res0, dst); + dst += dst_stride; + + inp11 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp8); + ST_UB(res0, dst); + dst += dst_stride; + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp9); + ST_UB(res0, dst); + dst += dst_stride; + + inp13 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp10); + ST_UB(res0, dst); + dst += dst_stride; + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp11); + ST_UB(res0, dst); + dst += dst_stride; + + inp15 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp12); + ST_UB(res0, dst); + dst += dst_stride; + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp13); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp14); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp15); + ST_UB(res0, dst); + dst += dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + res0 = __msa_ave_u_b(res0, inp16); + ST_UB(res0, dst); +} + +static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) 
__msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0, res1, dst0, dst1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + + inp5 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + + inp7 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp8, inp9); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, 
const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp10, inp11); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp12, inp13); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp14, inp15); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 dst0, dst1, dst2, dst3; + v16u8 res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + 
inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); +} + +static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0, res1, dst0, dst1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + inp5 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + inp7 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + inp9 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + inp11 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + inp13 = LD_UB(src); + src += 
src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + inp15 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, res0, res1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + LD_UB2(src, src_stride, inp4, inp5); + src += (2 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(src, src_stride, inp6, inp7); + src += (2 * src_stride); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); + dst += (4 * dst_stride); + + inp8 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, + inp7, inp8, inp8, inp7, + inp7, inp6, inp5, inp4, + inp8, inp8, inp7, inp6, + const20, const6, const3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); + tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); + dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); + 
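
[Editorial note, not part of the patch: the variants above differ only in how two bytes are combined after filtering. The aver_src*/avg_dst paths use the rounded byte average (__msa_aver_u_b), the no_rnd paths use the truncated average (__msa_ave_u_b), and the avg_dst paths additionally fold in the pixel already present in dst. A scalar sketch of those three flavours; function names are invented.]

#include <stdint.h>

/* Rounded average, as computed per byte by __msa_aver_u_b. */
static inline uint8_t avg_rnd(uint8_t a, uint8_t b)   { return (a + b + 1) >> 1; }

/* Truncated average, as computed per byte by __msa_ave_u_b (no_rnd paths). */
static inline uint8_t avg_nornd(uint8_t a, uint8_t b) { return (a + b) >> 1; }

/* avg_dst flavour: the filtered value is first averaged with the chosen
 * neighbouring source pixel, then with the existing destination pixel,
 * mirroring the AVER_UB2_UB(res, src, ...) / AVER_UB2_UB(dst, res, ...)
 * pairs in the code above. */
static inline uint8_t avg_dst_px(uint8_t filt, uint8_t src_px, uint8_t dst_px)
{
    return avg_rnd(dst_px, avg_rnd(filt, src_px));
}
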
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); + AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); + AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); + ST8x4_UB(res0, res1, dst, dst_stride); +} + +static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; + v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; + v16u8 res0, res1, dst0, dst1; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); + src += (5 * src_stride); + res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, + inp1, inp2, inp3, inp4, + const20, const6, const3); + inp5 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, + inp2, inp3, inp4, inp5, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp6 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, + inp3, inp4, inp5, inp6, + const20, const6, const3); + inp7 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, + inp4, inp5, inp6, inp7, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp8 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, + inp5, inp6, inp7, inp8, + const20, const6, const3); + inp9 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, + inp6, inp7, inp8, inp9, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp10 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, + inp7, inp8, inp9, inp10, + const20, const6, const3); + inp11 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, + inp8, inp9, inp10, inp11, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp12 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, + inp9, inp10, inp11, inp12, + const20, const6, const3); + inp13 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, + inp10, inp11, inp12, inp13, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp14 = LD_UB(src); + src += src_stride; + res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, + inp11, inp12, inp13, inp14, + const20, const6, const3); + inp15 = LD_UB(src); + src += src_stride; + res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, + inp12, inp13, inp14, inp15, + const20, 
const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp16 = LD_UB(src); + res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, + inp13, inp14, inp15, inp16, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, + inp14, inp15, inp16, inp16, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, + inp15, inp16, inp16, inp15, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, + inp16, inp16, inp15, inp14, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST_UB2(res0, res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp0, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp2, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp4, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp6, res); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp0, res); + ST_UB(res, dst); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = 
(v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz8 = __msa_ave_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_ave_u_b(avg1, res1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_ave_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + 
LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + ST_UB(res, dst); +} + +static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_ave_u_b(avg1, res1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) 
horiz1, (v2i64) horiz0); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp1); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp5); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_ave_u_b(res, inp7); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_ave_u_b(inp1, res); + ST_UB(res, dst); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = 
(v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_ave_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + 
v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz8 = __msa_ave_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, 
horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + 
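+    /* horizontal-only averaging ("h_src1"): each filtered row is ave()'d with the pixels at src + 1 (the SLDI_B2_UB shift); the vertical filter output is stored without further averaging */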
LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_ave_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 
0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz8 = __msa_ave_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_ave_u_b(avg1, res1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_ave_u_b(avg0, res0); + + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); + 
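+    /* the 16x16 h+v cases run in two passes: the horizontal pass writes a 16x17 block into buff (16 * 17 = 272 bytes), then the vertical pass reads it back with a stride of 16 */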
vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_ave_u_b(avg0, res0); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_ave_u_b(avg1, res1); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_ave_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + 
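+    /* src11: half-pel offset in both directions, i.e. the horiz_src1 pass averages with the pixels at src + 1 and the aver_src1 vertical pass averages with the row below */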
vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_ave_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_ave_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, + mask2, mask3, const20, + const6, const3); + + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_ave_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_ave_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, + mask2, mask3, const20, + const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_ave_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_ave_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, + mask2, mask3, const20, + const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_ave_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res1 = 
APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_ave_u_b(avg0, res0); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_ave_u_b(avg1, res1); + ST8x4_UB(res0, res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp0, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp2, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp4, res); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_aver_u_b(inp6, res); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); + res = __msa_aver_u_b(inp0, res); + ST_UB(res, dst); +} + +static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, 
mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_aver_u_b(avg1, res1); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); + ST_UB(res, dst); +} + +static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, 
src_stride, buff, 16, 16); + vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_aver_u_b(avg1, res1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height) +{ + uint8_t loop_count; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; + v16u8 res; + v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0 }; + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + v8u16 const20 = (v8u16) __msa_ldi_h(20); + + for (loop_count = (height >> 2); loop_count--;) { + LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); + LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); + src += (4 * src_stride); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp1); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp3); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp5); + ST_UB(res, dst); + dst += dst_stride; + + res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, + const20, const6, const3); + res = __msa_aver_u_b(res, inp7); + ST_UB(res, dst); + dst += dst_stride; + } + + LD_UB2(src, 1, inp0, inp1); + res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); + res = __msa_aver_u_b(inp1, res); + ST_UB(res, dst); +} + +static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res0 = 
APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = __msa_aver_u_b(avg1, res1); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) 
__msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, 
mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) 
inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) 
__msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4); + res1 = __msa_aver_u_b(avg1, res1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); +} + +static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + 
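+    /* each *_8BYTE macro filters two 8-pixel rows packed in one vector; splati_d then copies the upper doubleword out as the odd-numbered row (here horiz7 from horiz6) */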
horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4); + res1 = __msa_aver_u_b(avg1, res1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); + res0 = __msa_aver_u_b(avg0, res0); + + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); + src += (4 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, 
horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_aver_u_b(avg0, res0); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_aver_u_b(avg1, res1); + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + ST8x2_UB(res1, dst, dst_stride); + dst += 2 * dst_stride; + + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_aver_u_b(avg0, res0); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + ST8x2_UB(res0, dst, dst_stride); + dst += 2 * dst_stride; + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, 
(v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 
4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + 
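    /* Editor's note, not part of the patch: every 16x16 HV case is a two-pass
     * filter -- the horizontal qpel pass fills the 272-byte buff[] with
     * 16-byte rows (one extra row for the vertical taps), and the matching
     * vert_mc_qpel_* pass below then reads that buffer with a stride of 16. */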
vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + LD_UB2(dst, 
dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, 
(v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); +} + +static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); + +} + +static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = 
APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) 
inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + 
horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz8 = __msa_aver_u_b(inp0, res0); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, + horiz5, horiz6, horiz7, horiz8, + horiz5, horiz4, horiz3, horiz2, + horiz6, horiz7, horiz8, horiz8, + const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, + horiz7, horiz8, horiz8, horiz7, + horiz7, horiz6, horiz5, horiz4, + horiz8, horiz8, horiz7, horiz6, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_aver_u_b(avg0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_aver_u_b(avg1, res1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void 
hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, + horiz1, horiz2, horiz3, horiz4, + horiz1, horiz0, horiz0, horiz1, + horiz2, horiz3, horiz4, horiz5, + const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, + mask0, mask1, mask2, mask3, + const20, const6, const3); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, + horiz3, horiz4, horiz5, horiz6, + horiz3, horiz2, horiz1, horiz0, + horiz4, horiz5, horiz6, horiz7, + const20, const6, const3); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, + mask0, mask1, mask2, mask3, + const20, const6, const3); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, + horiz6, horiz7, horiz8, horiz5, horiz4, + horiz3, horiz2, horiz6, horiz7, horiz8, + horiz8, const20, const6, const3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, + horiz8, horiz8, horiz7, horiz7, horiz6, + horiz5, horiz4, horiz8, horiz8, horiz7, + horiz6, const20, const6, const3); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = __msa_aver_u_b(avg0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = __msa_aver_u_b(avg1, res1); + LD_UB2(dst, 
dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t buff[272]; + + hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); + vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); +} + +static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride) +{ + v16u8 inp0, inp1, inp2, inp3; + v16u8 res0, res1, avg0, avg1; + v16u8 horiz0, horiz1, horiz2, horiz3; + v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; + v16u8 dst0, dst1; + v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; + v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; + v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; + v16u8 const20 = (v16u8) __msa_ldi_b(20); + v16u8 const6 = (v16u8) __msa_ldi_b(6); + v16u8 const3 = (v16u8) __msa_ldi_b(3); + + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz0 = __msa_aver_u_b(inp0, res0); + horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + LD_UB2(src, src_stride, inp0, inp1); + src += (2 * src_stride); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz2 = __msa_aver_u_b(inp2, res1); + horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + + inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); + horiz4 = __msa_aver_u_b(inp0, res0); + horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1, + horiz2, horiz3, horiz4, horiz1, horiz0, + horiz0, horiz1, horiz2, horiz3, horiz4, + horiz5, const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(src, src_stride, inp2, inp3); + src += (2 * src_stride); + res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, + const20, const6, const3); + SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + + inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); + horiz6 = __msa_aver_u_b(inp2, res1); + horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3, + horiz4, horiz5, horiz6, horiz3, horiz2, + horiz1, horiz0, horiz4, horiz5, horiz6, + horiz7, const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) 
__msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); + dst += (2 * dst_stride); + + inp0 = LD_UB(src); + res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, + const20, const6, const3); + inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); + horiz8 = __msa_aver_u_b(inp0, res0); + LD_UB2(dst, dst_stride, dst0, dst1); + avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); + res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, + horiz6, horiz7, horiz8, horiz5, horiz4, + horiz3, horiz2, horiz6, horiz7, horiz8, + horiz8, const20, const6, const3); + res0 = __msa_aver_u_b(avg0, res0); + avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res0 = __msa_aver_u_b(avg0, res0); + ST8x2_UB(res0, dst, dst_stride); + dst += (2 * dst_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); + res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, + horiz8, horiz8, horiz7, horiz7, horiz6, + horiz5, horiz4, horiz8, horiz8, horiz7, + horiz6, const20, const6, const3); + res1 = __msa_aver_u_b(avg1, res1); + avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + res1 = __msa_aver_u_b(avg1, res1); + ST8x2_UB(res1, dst, dst_stride); +} + +static void copy_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint64_t src0, src1; + int32_t loop_cnt; + + for (loop_cnt = 4; loop_cnt--;) { + src0 = LD(src); + src += src_stride; + src1 = LD(src); + src += src_stride; + + SD(src0, dst); + dst += dst_stride; + SD(src1, dst); + dst += dst_stride; + } +} + +static void copy_16x16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(src, src_stride, + src8, src9, src10, src11, src12, src13, src14, src15); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, + dst, dst_stride); +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_d((v2i64) dst0, 0); + out1 = __msa_copy_u_d((v2i64) dst1, 0); + out2 = __msa_copy_u_d((v2i64) dst2, 0); + out3 = __msa_copy_u_d((v2i64) dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, 
dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) +{ + copy_16x16_msa(src, stride, dest, stride); +} + +void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) +{ + copy_8x8_msa(src, stride, dest, stride); +} + +void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_16width_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16); +} + +void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) +{ + avg_width8_msa(src, stride, dest, stride, 8); +} + +void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) +{ + avg_width16_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest, + const uint8_t 
*src, ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16); +} + +void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8); +} + +void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16); +} + + +void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride); +} + 
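Editor's sketch, not part of the patch: the qpel entry points above all boil down to the familiar MPEG-4-style 8-tap quarter-pel lowpass, which the MSA code expresses through the const20/const6/const3 vectors and the mirrored shuffle masks mask0..mask3. A minimal scalar model of one interior output pixel follows; block-edge mirroring is omitted for clarity, the rounding constant 16 is assumed for the rounding variants (the no_rnd functions presumably round with 15 instead), and the helper name is illustrative only.

#include <stdint.h>

/* One interior pixel of the 8-tap qpel lowpass: taps 20, -6, 3, -1 applied
 * symmetrically around the half-pel position, rounded and clipped to 8 bits. */
static inline uint8_t qpel_lowpass_pixel(const uint8_t *p)
{
    int sum = 20 * (p[ 0] + p[1])
            -  6 * (p[-1] + p[2])
            +  3 * (p[-2] + p[3])
            -      (p[-3] + p[4]);
    sum = (sum + 16) >> 5;
    return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t) sum;
}

The aver_src0/aver_src1 variants then average this result with the source pixel at offset 0 or 1 (the extra __msa_aver_u_b() after the horizontal filter, with SLDI_B2_UB shifting the row by one byte for the src1 case), and the avg_dst entry points additionally average with the pixels already in dst, which is why those kernels load dst0/dst1 before each store.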
+/* HV cases */ +void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride); 
+} + +void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest, + const uint8_t 
*src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest, + const uint8_t *src, ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride); +} + +void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest, + const uint8_t *src, + ptrdiff_t stride) +{ + hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride); +} diff --git a/libavcodec/mips/sbrdsp_mips.c b/libavcodec/mips/sbrdsp_mips.c new file mode 100644 index 0000000000..1b0a10608d --- /dev/null +++ b/libavcodec/mips/sbrdsp_mips.c @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Darko Laus (darko@mips.com) + * Djordje Pesut (djordje@mips.com) + * Mirjana Vulin (mvulin@mips.com) + * + * AAC Spectral Band Replication decoding functions optimized for MIPS + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Reference: libavcodec/sbrdsp.c + */ + +#include "config.h" +#include "libavcodec/sbrdsp.h" +#include "libavutil/mips/asmdefs.h" + +#if HAVE_INLINE_ASM +static void sbr_qmf_pre_shuffle_mips(float *z) +{ + int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6; + float *z1 = &z[66]; + float *z2 = &z[59]; + float *z3 = &z[2]; + float *z4 = z1 + 60; + + /* loop unrolled 5 times */ + __asm__ volatile ( + "lui %[Temp6], 0x8000 \n\t" + "1: \n\t" + "lw %[Temp1], 0(%[z2]) \n\t" + "lw %[Temp2], 4(%[z2]) \n\t" + "lw %[Temp3], 8(%[z2]) \n\t" + "lw %[Temp4], 12(%[z2]) \n\t" + "lw %[Temp5], 16(%[z2]) \n\t" + "xor %[Temp1], %[Temp1], %[Temp6] \n\t" + "xor %[Temp2], %[Temp2], %[Temp6] \n\t" + "xor %[Temp3], %[Temp3], %[Temp6] \n\t" + "xor %[Temp4], %[Temp4], %[Temp6] \n\t" + "xor %[Temp5], %[Temp5], %[Temp6] \n\t" + PTR_ADDIU "%[z2], %[z2], -20 \n\t" + "sw %[Temp1], 32(%[z1]) \n\t" + "sw %[Temp2], 24(%[z1]) \n\t" + "sw %[Temp3], 16(%[z1]) \n\t" + "sw %[Temp4], 8(%[z1]) \n\t" + "sw %[Temp5], 0(%[z1]) \n\t" + "lw %[Temp1], 0(%[z3]) \n\t" + "lw %[Temp2], 4(%[z3]) \n\t" + "lw %[Temp3], 8(%[z3]) \n\t" + "lw %[Temp4], 12(%[z3]) \n\t" + "lw %[Temp5], 16(%[z3]) \n\t" + "sw %[Temp1], 4(%[z1]) \n\t" + "sw %[Temp2], 12(%[z1]) \n\t" + "sw %[Temp3], 20(%[z1]) \n\t" + "sw %[Temp4], 28(%[z1]) \n\t" + "sw %[Temp5], 36(%[z1]) \n\t" + PTR_ADDIU "%[z3], %[z3], 20 \n\t" + PTR_ADDIU "%[z1], %[z1], 40 \n\t" + "bne %[z1], %[z4], 1b \n\t" + "lw %[Temp1], 132(%[z]) \n\t" + "lw %[Temp2], 128(%[z]) \n\t" + "lw %[Temp3], 0(%[z]) \n\t" + "lw %[Temp4], 4(%[z]) \n\t" + "xor %[Temp1], %[Temp1], %[Temp6] \n\t" + "sw %[Temp1], 504(%[z]) \n\t" + "sw %[Temp2], 508(%[z]) \n\t" + "sw %[Temp3], 256(%[z]) \n\t" + "sw %[Temp4], 260(%[z]) \n\t" + + : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2), + [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4), + [Temp5]"=&r"(Temp5), 
[Temp6]"=&r"(Temp6), + [z1]"+r"(z1), [z2]"+r"(z2), [z3]"+r"(z3) + : [z4]"r"(z4), [z]"r"(z) + : "memory" + ); +} + +static void sbr_qmf_post_shuffle_mips(float W[32][2], const float *z) +{ + int Temp1, Temp2, Temp3, Temp4, Temp5; + float *W_ptr = (float *)W; + float *z1 = (float *)z; + float *z2 = (float *)&z[60]; + float *z_end = z1 + 32; + + /* loop unrolled 4 times */ + __asm__ volatile ( + "lui %[Temp5], 0x8000 \n\t" + "1: \n\t" + "lw %[Temp1], 0(%[z2]) \n\t" + "lw %[Temp2], 4(%[z2]) \n\t" + "lw %[Temp3], 8(%[z2]) \n\t" + "lw %[Temp4], 12(%[z2]) \n\t" + "xor %[Temp1], %[Temp1], %[Temp5] \n\t" + "xor %[Temp2], %[Temp2], %[Temp5] \n\t" + "xor %[Temp3], %[Temp3], %[Temp5] \n\t" + "xor %[Temp4], %[Temp4], %[Temp5] \n\t" + PTR_ADDIU "%[z2], %[z2], -16 \n\t" + "sw %[Temp1], 24(%[W_ptr]) \n\t" + "sw %[Temp2], 16(%[W_ptr]) \n\t" + "sw %[Temp3], 8(%[W_ptr]) \n\t" + "sw %[Temp4], 0(%[W_ptr]) \n\t" + "lw %[Temp1], 0(%[z1]) \n\t" + "lw %[Temp2], 4(%[z1]) \n\t" + "lw %[Temp3], 8(%[z1]) \n\t" + "lw %[Temp4], 12(%[z1]) \n\t" + "sw %[Temp1], 4(%[W_ptr]) \n\t" + "sw %[Temp2], 12(%[W_ptr]) \n\t" + "sw %[Temp3], 20(%[W_ptr]) \n\t" + "sw %[Temp4], 28(%[W_ptr]) \n\t" + PTR_ADDIU "%[z1], %[z1], 16 \n\t" + PTR_ADDIU "%[W_ptr],%[W_ptr], 32 \n\t" + "bne %[z1], %[z_end], 1b \n\t" + + : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2), + [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4), + [Temp5]"=&r"(Temp5), [z1]"+r"(z1), + [z2]"+r"(z2), [W_ptr]"+r"(W_ptr) + : [z_end]"r"(z_end) + : "memory" + ); +} + +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 +static void sbr_sum64x5_mips(float *z) +{ + int k; + float *z1; + float f1, f2, f3, f4, f5, f6, f7, f8; + for (k = 0; k < 64; k += 8) { + + z1 = &z[k]; + + /* loop unrolled 8 times */ + __asm__ volatile ( + "lwc1 $f0, 0(%[z1]) \n\t" + "lwc1 $f1, 256(%[z1]) \n\t" + "lwc1 $f2, 4(%[z1]) \n\t" + "lwc1 $f3, 260(%[z1]) \n\t" + "lwc1 $f4, 8(%[z1]) \n\t" + "add.s %[f1], $f0, $f1 \n\t" + "lwc1 $f5, 264(%[z1]) \n\t" + "add.s %[f2], $f2, $f3 \n\t" + "lwc1 $f6, 12(%[z1]) \n\t" + "lwc1 $f7, 268(%[z1]) \n\t" + "add.s %[f3], $f4, $f5 \n\t" + "lwc1 $f8, 16(%[z1]) \n\t" + "lwc1 $f9, 272(%[z1]) \n\t" + "add.s %[f4], $f6, $f7 \n\t" + "lwc1 $f10, 20(%[z1]) \n\t" + "lwc1 $f11, 276(%[z1]) \n\t" + "add.s %[f5], $f8, $f9 \n\t" + "lwc1 $f12, 24(%[z1]) \n\t" + "lwc1 $f13, 280(%[z1]) \n\t" + "add.s %[f6], $f10, $f11 \n\t" + "lwc1 $f14, 28(%[z1]) \n\t" + "lwc1 $f15, 284(%[z1]) \n\t" + "add.s %[f7], $f12, $f13 \n\t" + "lwc1 $f0, 512(%[z1]) \n\t" + "lwc1 $f1, 516(%[z1]) \n\t" + "add.s %[f8], $f14, $f15 \n\t" + "lwc1 $f2, 520(%[z1]) \n\t" + "add.s %[f1], %[f1], $f0 \n\t" + "add.s %[f2], %[f2], $f1 \n\t" + "lwc1 $f3, 524(%[z1]) \n\t" + "add.s %[f3], %[f3], $f2 \n\t" + "lwc1 $f4, 528(%[z1]) \n\t" + "lwc1 $f5, 532(%[z1]) \n\t" + "add.s %[f4], %[f4], $f3 \n\t" + "lwc1 $f6, 536(%[z1]) \n\t" + "add.s %[f5], %[f5], $f4 \n\t" + "add.s %[f6], %[f6], $f5 \n\t" + "lwc1 $f7, 540(%[z1]) \n\t" + "add.s %[f7], %[f7], $f6 \n\t" + "lwc1 $f0, 768(%[z1]) \n\t" + "lwc1 $f1, 772(%[z1]) \n\t" + "add.s %[f8], %[f8], $f7 \n\t" + "lwc1 $f2, 776(%[z1]) \n\t" + "add.s %[f1], %[f1], $f0 \n\t" + "add.s %[f2], %[f2], $f1 \n\t" + "lwc1 $f3, 780(%[z1]) \n\t" + "add.s %[f3], %[f3], $f2 \n\t" + "lwc1 $f4, 784(%[z1]) \n\t" + "lwc1 $f5, 788(%[z1]) \n\t" + "add.s %[f4], %[f4], $f3 \n\t" + "lwc1 $f6, 792(%[z1]) \n\t" + "add.s %[f5], %[f5], $f4 \n\t" + "add.s %[f6], %[f6], $f5 \n\t" + "lwc1 $f7, 796(%[z1]) \n\t" + "add.s %[f7], %[f7], $f6 \n\t" + "lwc1 $f0, 1024(%[z1]) \n\t" + "lwc1 $f1, 1028(%[z1]) \n\t" + "add.s %[f8], %[f8], $f7 \n\t" + 
"lwc1 $f2, 1032(%[z1]) \n\t" + "add.s %[f1], %[f1], $f0 \n\t" + "add.s %[f2], %[f2], $f1 \n\t" + "lwc1 $f3, 1036(%[z1]) \n\t" + "add.s %[f3], %[f3], $f2 \n\t" + "lwc1 $f4, 1040(%[z1]) \n\t" + "lwc1 $f5, 1044(%[z1]) \n\t" + "add.s %[f4], %[f4], $f3 \n\t" + "lwc1 $f6, 1048(%[z1]) \n\t" + "add.s %[f5], %[f5], $f4 \n\t" + "add.s %[f6], %[f6], $f5 \n\t" + "lwc1 $f7, 1052(%[z1]) \n\t" + "add.s %[f7], %[f7], $f6 \n\t" + "swc1 %[f1], 0(%[z1]) \n\t" + "swc1 %[f2], 4(%[z1]) \n\t" + "add.s %[f8], %[f8], $f7 \n\t" + "swc1 %[f3], 8(%[z1]) \n\t" + "swc1 %[f4], 12(%[z1]) \n\t" + "swc1 %[f5], 16(%[z1]) \n\t" + "swc1 %[f6], 20(%[z1]) \n\t" + "swc1 %[f7], 24(%[z1]) \n\t" + "swc1 %[f8], 28(%[z1]) \n\t" + + : [f1]"=&f"(f1), [f2]"=&f"(f2), [f3]"=&f"(f3), + [f4]"=&f"(f4), [f5]"=&f"(f5), [f6]"=&f"(f6), + [f7]"=&f"(f7), [f8]"=&f"(f8) + : [z1]"r"(z1) + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", + "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", + "$f12", "$f13", "$f14", "$f15", + "memory" + ); + } +} + +static float sbr_sum_square_mips(float (*x)[2], int n) +{ + float sum0 = 0.0f, sum1 = 0.0f; + float *p_x; + float temp0, temp1, temp2, temp3; + float *loop_end; + p_x = &x[0][0]; + loop_end = p_x + (n >> 1)*4 - 4; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "lwc1 %[temp0], 0(%[p_x]) \n\t" + "lwc1 %[temp1], 4(%[p_x]) \n\t" + "lwc1 %[temp2], 8(%[p_x]) \n\t" + "lwc1 %[temp3], 12(%[p_x]) \n\t" + "1: \n\t" + PTR_ADDIU "%[p_x], %[p_x], 16 \n\t" + "madd.s %[sum0], %[sum0], %[temp0], %[temp0] \n\t" + "lwc1 %[temp0], 0(%[p_x]) \n\t" + "madd.s %[sum1], %[sum1], %[temp1], %[temp1] \n\t" + "lwc1 %[temp1], 4(%[p_x]) \n\t" + "madd.s %[sum0], %[sum0], %[temp2], %[temp2] \n\t" + "lwc1 %[temp2], 8(%[p_x]) \n\t" + "madd.s %[sum1], %[sum1], %[temp3], %[temp3] \n\t" + "bne %[p_x], %[loop_end], 1b \n\t" + " lwc1 %[temp3], 12(%[p_x]) \n\t" + "madd.s %[sum0], %[sum0], %[temp0], %[temp0] \n\t" + "madd.s %[sum1], %[sum1], %[temp1], %[temp1] \n\t" + "madd.s %[sum0], %[sum0], %[temp2], %[temp2] \n\t" + "madd.s %[sum1], %[sum1], %[temp3], %[temp3] \n\t" + ".set pop \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [sum0]"+f"(sum0), [sum1]"+f"(sum1), + [p_x]"+r"(p_x) + : [loop_end]"r"(loop_end) + : "memory" + ); + return sum0 + sum1; +} + +static void sbr_qmf_deint_bfly_mips(float *v, const float *src0, const float *src1) +{ + int i; + float temp0, temp1, temp2, temp3, temp4, temp5; + float temp6, temp7, temp8, temp9, temp10, temp11; + float *v0 = v; + float *v1 = &v[127]; + float *psrc0 = (float*)src0; + float *psrc1 = (float*)&src1[63]; + + for (i = 0; i < 4; i++) { + + /* loop unrolled 16 times */ + __asm__ volatile( + "lwc1 %[temp0], 0(%[src0]) \n\t" + "lwc1 %[temp1], 0(%[src1]) \n\t" + "lwc1 %[temp3], 4(%[src0]) \n\t" + "lwc1 %[temp4], -4(%[src1]) \n\t" + "lwc1 %[temp6], 8(%[src0]) \n\t" + "lwc1 %[temp7], -8(%[src1]) \n\t" + "lwc1 %[temp9], 12(%[src0]) \n\t" + "lwc1 %[temp10], -12(%[src1]) \n\t" + "add.s %[temp2], %[temp0], %[temp1] \n\t" + "add.s %[temp5], %[temp3], %[temp4] \n\t" + "add.s %[temp8], %[temp6], %[temp7] \n\t" + "add.s %[temp11], %[temp9], %[temp10] \n\t" + "sub.s %[temp0], %[temp0], %[temp1] \n\t" + "sub.s %[temp3], %[temp3], %[temp4] \n\t" + "sub.s %[temp6], %[temp6], %[temp7] \n\t" + "sub.s %[temp9], %[temp9], %[temp10] \n\t" + "swc1 %[temp2], 0(%[v1]) \n\t" + "swc1 %[temp0], 0(%[v0]) \n\t" + "swc1 %[temp5], -4(%[v1]) \n\t" + "swc1 %[temp3], 4(%[v0]) \n\t" + "swc1 %[temp8], -8(%[v1]) \n\t" + "swc1 %[temp6], 8(%[v0]) \n\t" + "swc1 %[temp11], -12(%[v1]) 
\n\t" + "swc1 %[temp9], 12(%[v0]) \n\t" + "lwc1 %[temp0], 16(%[src0]) \n\t" + "lwc1 %[temp1], -16(%[src1]) \n\t" + "lwc1 %[temp3], 20(%[src0]) \n\t" + "lwc1 %[temp4], -20(%[src1]) \n\t" + "lwc1 %[temp6], 24(%[src0]) \n\t" + "lwc1 %[temp7], -24(%[src1]) \n\t" + "lwc1 %[temp9], 28(%[src0]) \n\t" + "lwc1 %[temp10], -28(%[src1]) \n\t" + "add.s %[temp2], %[temp0], %[temp1] \n\t" + "add.s %[temp5], %[temp3], %[temp4] \n\t" + "add.s %[temp8], %[temp6], %[temp7] \n\t" + "add.s %[temp11], %[temp9], %[temp10] \n\t" + "sub.s %[temp0], %[temp0], %[temp1] \n\t" + "sub.s %[temp3], %[temp3], %[temp4] \n\t" + "sub.s %[temp6], %[temp6], %[temp7] \n\t" + "sub.s %[temp9], %[temp9], %[temp10] \n\t" + "swc1 %[temp2], -16(%[v1]) \n\t" + "swc1 %[temp0], 16(%[v0]) \n\t" + "swc1 %[temp5], -20(%[v1]) \n\t" + "swc1 %[temp3], 20(%[v0]) \n\t" + "swc1 %[temp8], -24(%[v1]) \n\t" + "swc1 %[temp6], 24(%[v0]) \n\t" + "swc1 %[temp11], -28(%[v1]) \n\t" + "swc1 %[temp9], 28(%[v0]) \n\t" + "lwc1 %[temp0], 32(%[src0]) \n\t" + "lwc1 %[temp1], -32(%[src1]) \n\t" + "lwc1 %[temp3], 36(%[src0]) \n\t" + "lwc1 %[temp4], -36(%[src1]) \n\t" + "lwc1 %[temp6], 40(%[src0]) \n\t" + "lwc1 %[temp7], -40(%[src1]) \n\t" + "lwc1 %[temp9], 44(%[src0]) \n\t" + "lwc1 %[temp10], -44(%[src1]) \n\t" + "add.s %[temp2], %[temp0], %[temp1] \n\t" + "add.s %[temp5], %[temp3], %[temp4] \n\t" + "add.s %[temp8], %[temp6], %[temp7] \n\t" + "add.s %[temp11], %[temp9], %[temp10] \n\t" + "sub.s %[temp0], %[temp0], %[temp1] \n\t" + "sub.s %[temp3], %[temp3], %[temp4] \n\t" + "sub.s %[temp6], %[temp6], %[temp7] \n\t" + "sub.s %[temp9], %[temp9], %[temp10] \n\t" + "swc1 %[temp2], -32(%[v1]) \n\t" + "swc1 %[temp0], 32(%[v0]) \n\t" + "swc1 %[temp5], -36(%[v1]) \n\t" + "swc1 %[temp3], 36(%[v0]) \n\t" + "swc1 %[temp8], -40(%[v1]) \n\t" + "swc1 %[temp6], 40(%[v0]) \n\t" + "swc1 %[temp11], -44(%[v1]) \n\t" + "swc1 %[temp9], 44(%[v0]) \n\t" + "lwc1 %[temp0], 48(%[src0]) \n\t" + "lwc1 %[temp1], -48(%[src1]) \n\t" + "lwc1 %[temp3], 52(%[src0]) \n\t" + "lwc1 %[temp4], -52(%[src1]) \n\t" + "lwc1 %[temp6], 56(%[src0]) \n\t" + "lwc1 %[temp7], -56(%[src1]) \n\t" + "lwc1 %[temp9], 60(%[src0]) \n\t" + "lwc1 %[temp10], -60(%[src1]) \n\t" + "add.s %[temp2], %[temp0], %[temp1] \n\t" + "add.s %[temp5], %[temp3], %[temp4] \n\t" + "add.s %[temp8], %[temp6], %[temp7] \n\t" + "add.s %[temp11], %[temp9], %[temp10] \n\t" + "sub.s %[temp0], %[temp0], %[temp1] \n\t" + "sub.s %[temp3], %[temp3], %[temp4] \n\t" + "sub.s %[temp6], %[temp6], %[temp7] \n\t" + "sub.s %[temp9], %[temp9], %[temp10] \n\t" + "swc1 %[temp2], -48(%[v1]) \n\t" + "swc1 %[temp0], 48(%[v0]) \n\t" + "swc1 %[temp5], -52(%[v1]) \n\t" + "swc1 %[temp3], 52(%[v0]) \n\t" + "swc1 %[temp8], -56(%[v1]) \n\t" + "swc1 %[temp6], 56(%[v0]) \n\t" + "swc1 %[temp11], -60(%[v1]) \n\t" + "swc1 %[temp9], 60(%[v0]) \n\t" + PTR_ADDIU " %[src0], %[src0], 64 \n\t" + PTR_ADDIU " %[src1], %[src1], -64 \n\t" + PTR_ADDIU " %[v0], %[v0], 64 \n\t" + PTR_ADDIU " %[v1], %[v1], -64 \n\t" + + : [v0]"+r"(v0), [v1]"+r"(v1), [src0]"+r"(psrc0), [src1]"+r"(psrc1), + [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11) + : + :"memory" + ); + } +} + +static void sbr_autocorrelate_mips(const float x[40][2], float phi[3][2][2]) +{ + int i; + float real_sum_0 = 0.0f; + float real_sum_1 = 0.0f; + float real_sum_2 = 0.0f; + float imag_sum_1 = 0.0f; + float imag_sum_2 
= 0.0f; + float *p_x, *p_phi; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6; + float temp7, temp_r, temp_r1, temp_r2, temp_r3, temp_r4; + p_x = (float*)&x[0][0]; + p_phi = &phi[0][0][0]; + + __asm__ volatile ( + "lwc1 %[temp0], 8(%[p_x]) \n\t" + "lwc1 %[temp1], 12(%[p_x]) \n\t" + "lwc1 %[temp2], 16(%[p_x]) \n\t" + "lwc1 %[temp3], 20(%[p_x]) \n\t" + "lwc1 %[temp4], 24(%[p_x]) \n\t" + "lwc1 %[temp5], 28(%[p_x]) \n\t" + "mul.s %[temp_r], %[temp1], %[temp1] \n\t" + "mul.s %[temp_r1], %[temp1], %[temp3] \n\t" + "mul.s %[temp_r2], %[temp1], %[temp2] \n\t" + "mul.s %[temp_r3], %[temp1], %[temp5] \n\t" + "mul.s %[temp_r4], %[temp1], %[temp4] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp0], %[temp0] \n\t" + "madd.s %[temp_r1], %[temp_r1], %[temp0], %[temp2] \n\t" + "msub.s %[temp_r2], %[temp_r2], %[temp0], %[temp3] \n\t" + "madd.s %[temp_r3], %[temp_r3], %[temp0], %[temp4] \n\t" + "msub.s %[temp_r4], %[temp_r4], %[temp0], %[temp5] \n\t" + "add.s %[real_sum_0], %[real_sum_0], %[temp_r] \n\t" + "add.s %[real_sum_1], %[real_sum_1], %[temp_r1] \n\t" + "add.s %[imag_sum_1], %[imag_sum_1], %[temp_r2] \n\t" + "add.s %[real_sum_2], %[real_sum_2], %[temp_r3] \n\t" + "add.s %[imag_sum_2], %[imag_sum_2], %[temp_r4] \n\t" + PTR_ADDIU "%[p_x], %[p_x], 8 \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1), + [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2), + [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), [temp_r2]"=&f"(temp_r2), + [temp_r3]"=&f"(temp_r3), [temp_r4]"=&f"(temp_r4), + [p_x]"+r"(p_x), [imag_sum_2]"+f"(imag_sum_2) + : + : "memory" + ); + + for (i = 0; i < 12; i++) { + __asm__ volatile ( + "lwc1 %[temp0], 8(%[p_x]) \n\t" + "lwc1 %[temp1], 12(%[p_x]) \n\t" + "lwc1 %[temp2], 16(%[p_x]) \n\t" + "lwc1 %[temp3], 20(%[p_x]) \n\t" + "lwc1 %[temp4], 24(%[p_x]) \n\t" + "lwc1 %[temp5], 28(%[p_x]) \n\t" + "mul.s %[temp_r], %[temp1], %[temp1] \n\t" + "mul.s %[temp_r1], %[temp1], %[temp3] \n\t" + "mul.s %[temp_r2], %[temp1], %[temp2] \n\t" + "mul.s %[temp_r3], %[temp1], %[temp5] \n\t" + "mul.s %[temp_r4], %[temp1], %[temp4] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp0], %[temp0] \n\t" + "madd.s %[temp_r1], %[temp_r1], %[temp0], %[temp2] \n\t" + "msub.s %[temp_r2], %[temp_r2], %[temp0], %[temp3] \n\t" + "madd.s %[temp_r3], %[temp_r3], %[temp0], %[temp4] \n\t" + "msub.s %[temp_r4], %[temp_r4], %[temp0], %[temp5] \n\t" + "add.s %[real_sum_0], %[real_sum_0], %[temp_r] \n\t" + "add.s %[real_sum_1], %[real_sum_1], %[temp_r1] \n\t" + "add.s %[imag_sum_1], %[imag_sum_1], %[temp_r2] \n\t" + "add.s %[real_sum_2], %[real_sum_2], %[temp_r3] \n\t" + "add.s %[imag_sum_2], %[imag_sum_2], %[temp_r4] \n\t" + "lwc1 %[temp0], 32(%[p_x]) \n\t" + "lwc1 %[temp1], 36(%[p_x]) \n\t" + "mul.s %[temp_r], %[temp3], %[temp3] \n\t" + "mul.s %[temp_r1], %[temp3], %[temp5] \n\t" + "mul.s %[temp_r2], %[temp3], %[temp4] \n\t" + "mul.s %[temp_r3], %[temp3], %[temp1] \n\t" + "mul.s %[temp_r4], %[temp3], %[temp0] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp2], %[temp2] \n\t" + "madd.s %[temp_r1], %[temp_r1], %[temp2], %[temp4] \n\t" + "msub.s %[temp_r2], %[temp_r2], %[temp2], %[temp5] \n\t" + "madd.s %[temp_r3], %[temp_r3], %[temp2], %[temp0] \n\t" + "msub.s %[temp_r4], %[temp_r4], %[temp2], %[temp1] \n\t" + "add.s %[real_sum_0], %[real_sum_0], %[temp_r] \n\t" + "add.s %[real_sum_1], %[real_sum_1], %[temp_r1] \n\t" + "add.s %[imag_sum_1], %[imag_sum_1], %[temp_r2] \n\t" + "add.s 
%[real_sum_2], %[real_sum_2], %[temp_r3] \n\t" + "add.s %[imag_sum_2], %[imag_sum_2], %[temp_r4] \n\t" + "lwc1 %[temp2], 40(%[p_x]) \n\t" + "lwc1 %[temp3], 44(%[p_x]) \n\t" + "mul.s %[temp_r], %[temp5], %[temp5] \n\t" + "mul.s %[temp_r1], %[temp5], %[temp1] \n\t" + "mul.s %[temp_r2], %[temp5], %[temp0] \n\t" + "mul.s %[temp_r3], %[temp5], %[temp3] \n\t" + "mul.s %[temp_r4], %[temp5], %[temp2] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp4], %[temp4] \n\t" + "madd.s %[temp_r1], %[temp_r1], %[temp4], %[temp0] \n\t" + "msub.s %[temp_r2], %[temp_r2], %[temp4], %[temp1] \n\t" + "madd.s %[temp_r3], %[temp_r3], %[temp4], %[temp2] \n\t" + "msub.s %[temp_r4], %[temp_r4], %[temp4], %[temp3] \n\t" + "add.s %[real_sum_0], %[real_sum_0], %[temp_r] \n\t" + "add.s %[real_sum_1], %[real_sum_1], %[temp_r1] \n\t" + "add.s %[imag_sum_1], %[imag_sum_1], %[temp_r2] \n\t" + "add.s %[real_sum_2], %[real_sum_2], %[temp_r3] \n\t" + "add.s %[imag_sum_2], %[imag_sum_2], %[temp_r4] \n\t" + PTR_ADDIU "%[p_x], %[p_x], 24 \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1), + [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2), + [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), + [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3), + [temp_r4]"=&f"(temp_r4), [p_x]"+r"(p_x), + [imag_sum_2]"+f"(imag_sum_2) + : + : "memory" + ); + } + __asm__ volatile ( + "lwc1 %[temp0], -296(%[p_x]) \n\t" + "lwc1 %[temp1], -292(%[p_x]) \n\t" + "lwc1 %[temp2], 8(%[p_x]) \n\t" + "lwc1 %[temp3], 12(%[p_x]) \n\t" + "lwc1 %[temp4], -288(%[p_x]) \n\t" + "lwc1 %[temp5], -284(%[p_x]) \n\t" + "lwc1 %[temp6], -280(%[p_x]) \n\t" + "lwc1 %[temp7], -276(%[p_x]) \n\t" + "madd.s %[temp_r], %[real_sum_0], %[temp0], %[temp0] \n\t" + "madd.s %[temp_r1], %[real_sum_0], %[temp2], %[temp2] \n\t" + "madd.s %[temp_r2], %[real_sum_1], %[temp0], %[temp4] \n\t" + "madd.s %[temp_r3], %[imag_sum_1], %[temp0], %[temp5] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp1], %[temp1] \n\t" + "madd.s %[temp_r1], %[temp_r1], %[temp3], %[temp3] \n\t" + "madd.s %[temp_r2], %[temp_r2], %[temp1], %[temp5] \n\t" + "nmsub.s %[temp_r3], %[temp_r3], %[temp1], %[temp4] \n\t" + "lwc1 %[temp4], 16(%[p_x]) \n\t" + "lwc1 %[temp5], 20(%[p_x]) \n\t" + "swc1 %[temp_r], 40(%[p_phi]) \n\t" + "swc1 %[temp_r1], 16(%[p_phi]) \n\t" + "swc1 %[temp_r2], 24(%[p_phi]) \n\t" + "swc1 %[temp_r3], 28(%[p_phi]) \n\t" + "madd.s %[temp_r], %[real_sum_1], %[temp2], %[temp4] \n\t" + "madd.s %[temp_r1], %[imag_sum_1], %[temp2], %[temp5] \n\t" + "madd.s %[temp_r2], %[real_sum_2], %[temp0], %[temp6] \n\t" + "madd.s %[temp_r3], %[imag_sum_2], %[temp0], %[temp7] \n\t" + "madd.s %[temp_r], %[temp_r], %[temp3], %[temp5] \n\t" + "nmsub.s %[temp_r1], %[temp_r1], %[temp3], %[temp4] \n\t" + "madd.s %[temp_r2], %[temp_r2], %[temp1], %[temp7] \n\t" + "nmsub.s %[temp_r3], %[temp_r3], %[temp1], %[temp6] \n\t" + "swc1 %[temp_r], 0(%[p_phi]) \n\t" + "swc1 %[temp_r1], 4(%[p_phi]) \n\t" + "swc1 %[temp_r2], 8(%[p_phi]) \n\t" + "swc1 %[temp_r3], 12(%[p_phi]) \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp_r]"=&f"(temp_r), + [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1), + [real_sum_2]"+f"(real_sum_2), [imag_sum_1]"+f"(imag_sum_1), + [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3), + [temp_r1]"=&f"(temp_r1), [p_phi]"+r"(p_phi), + 
[imag_sum_2]"+f"(imag_sum_2) + : [p_x]"r"(p_x) + : "memory" + ); +} + +static void sbr_hf_gen_mips(float (*X_high)[2], const float (*X_low)[2], + const float alpha0[2], const float alpha1[2], + float bw, int start, int end) +{ + float alpha[4]; + int i; + float *p_x_low = (float*)&X_low[0][0] + 2*start; + float *p_x_high = &X_high[0][0] + 2*start; + float temp0, temp1, temp2, temp3, temp4, temp5, temp6; + float temp7, temp8, temp9, temp10, temp11, temp12; + + alpha[0] = alpha1[0] * bw * bw; + alpha[1] = alpha1[1] * bw * bw; + alpha[2] = alpha0[0] * bw; + alpha[3] = alpha0[1] * bw; + + for (i = start; i < end; i++) { + __asm__ volatile ( + "lwc1 %[temp0], -16(%[p_x_low]) \n\t" + "lwc1 %[temp1], -12(%[p_x_low]) \n\t" + "lwc1 %[temp2], -8(%[p_x_low]) \n\t" + "lwc1 %[temp3], -4(%[p_x_low]) \n\t" + "lwc1 %[temp5], 0(%[p_x_low]) \n\t" + "lwc1 %[temp6], 4(%[p_x_low]) \n\t" + "lwc1 %[temp7], 0(%[alpha]) \n\t" + "lwc1 %[temp8], 4(%[alpha]) \n\t" + "lwc1 %[temp9], 8(%[alpha]) \n\t" + "lwc1 %[temp10], 12(%[alpha]) \n\t" + PTR_ADDIU "%[p_x_high], %[p_x_high], 8 \n\t" + PTR_ADDIU "%[p_x_low], %[p_x_low], 8 \n\t" + "mul.s %[temp11], %[temp1], %[temp8] \n\t" + "msub.s %[temp11], %[temp11], %[temp0], %[temp7] \n\t" + "madd.s %[temp11], %[temp11], %[temp2], %[temp9] \n\t" + "nmsub.s %[temp11], %[temp11], %[temp3], %[temp10] \n\t" + "add.s %[temp11], %[temp11], %[temp5] \n\t" + "swc1 %[temp11], -8(%[p_x_high]) \n\t" + "mul.s %[temp12], %[temp1], %[temp7] \n\t" + "madd.s %[temp12], %[temp12], %[temp0], %[temp8] \n\t" + "madd.s %[temp12], %[temp12], %[temp3], %[temp9] \n\t" + "madd.s %[temp12], %[temp12], %[temp2], %[temp10] \n\t" + "add.s %[temp12], %[temp12], %[temp6] \n\t" + "swc1 %[temp12], -4(%[p_x_high]) \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), + [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), + [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11), + [temp12]"=&f"(temp12), [p_x_high]"+r"(p_x_high), + [p_x_low]"+r"(p_x_low) + : [alpha]"r"(alpha) + : "memory" + ); + } +} + +static void sbr_hf_g_filt_mips(float (*Y)[2], const float (*X_high)[40][2], + const float *g_filt, int m_max, intptr_t ixh) +{ + const float *p_x, *p_g, *loop_end; + float *p_y; + float temp0, temp1, temp2; + + p_g = &g_filt[0]; + p_y = &Y[0][0]; + p_x = &X_high[0][ixh][0]; + loop_end = p_g + m_max; + + __asm__ volatile( + ".set push \n\t" + ".set noreorder \n\t" + "1: \n\t" + "lwc1 %[temp0], 0(%[p_g]) \n\t" + "lwc1 %[temp1], 0(%[p_x]) \n\t" + "lwc1 %[temp2], 4(%[p_x]) \n\t" + "mul.s %[temp1], %[temp1], %[temp0] \n\t" + "mul.s %[temp2], %[temp2], %[temp0] \n\t" + PTR_ADDIU "%[p_g], %[p_g], 4 \n\t" + PTR_ADDIU "%[p_x], %[p_x], 320 \n\t" + "swc1 %[temp1], 0(%[p_y]) \n\t" + "swc1 %[temp2], 4(%[p_y]) \n\t" + "bne %[p_g], %[loop_end], 1b \n\t" + PTR_ADDIU "%[p_y], %[p_y], 8 \n\t" + ".set pop \n\t" + + : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), + [temp2]"=&f"(temp2), [p_x]"+r"(p_x), + [p_y]"+r"(p_y), [p_g]"+r"(p_g) + : [loop_end]"r"(loop_end) + : "memory" + ); +} + +static void sbr_hf_apply_noise_0_mips(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max) +{ + int m; + + for (m = 0; m < m_max; m++){ + + float *Y1=&Y[m][0]; + float *ff_table; + float y0,y1, temp1, temp2, temp4, temp5; + int temp0, temp3; + const float *s_m1=&s_m[m]; + const float *q_filt1= &q_filt[m]; + + __asm__ volatile( + "lwc1 %[y0], 0(%[Y1]) \n\t" + "lwc1 %[temp1], 0(%[s_m1]) \n\t" + "addiu %[noise], %[noise], 
1 \n\t" + "andi %[noise], %[noise], 0x1ff \n\t" + "sll %[temp0], %[noise], 3 \n\t" + PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0] \n\t" + "add.s %[y0], %[y0], %[temp1] \n\t" + "mfc1 %[temp3], %[temp1] \n\t" + "bne %[temp3], $0, 1f \n\t" + "lwc1 %[y1], 4(%[Y1]) \n\t" + "lwc1 %[temp2], 0(%[q_filt1]) \n\t" + "lwc1 %[temp4], 0(%[ff_table]) \n\t" + "lwc1 %[temp5], 4(%[ff_table]) \n\t" + "madd.s %[y0], %[y0], %[temp2], %[temp4] \n\t" + "madd.s %[y1], %[y1], %[temp2], %[temp5] \n\t" + "swc1 %[y1], 4(%[Y1]) \n\t" + "1: \n\t" + "swc1 %[y0], 0(%[Y1]) \n\t" + + : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1), + [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5) + : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise), + [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1) + : "memory" + ); + } +} + +static void sbr_hf_apply_noise_1_mips(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max) +{ + float y0,y1,temp1, temp2, temp4, temp5; + int temp0, temp3, m; + float phi_sign = 1 - 2 * (kx & 1); + + for (m = 0; m < m_max; m++) { + + float *ff_table; + float *Y1=&Y[m][0]; + const float *s_m1=&s_m[m]; + const float *q_filt1= &q_filt[m]; + + __asm__ volatile( + "lwc1 %[y1], 4(%[Y1]) \n\t" + "lwc1 %[temp1], 0(%[s_m1]) \n\t" + "lw %[temp3], 0(%[s_m1]) \n\t" + "addiu %[noise], %[noise], 1 \n\t" + "andi %[noise], %[noise], 0x1ff \n\t" + "sll %[temp0], %[noise], 3 \n\t" + PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0] \n\t" + "madd.s %[y1], %[y1], %[temp1], %[phi_sign] \n\t" + "bne %[temp3], $0, 1f \n\t" + "lwc1 %[y0], 0(%[Y1]) \n\t" + "lwc1 %[temp2], 0(%[q_filt1]) \n\t" + "lwc1 %[temp4], 0(%[ff_table]) \n\t" + "lwc1 %[temp5], 4(%[ff_table]) \n\t" + "madd.s %[y0], %[y0], %[temp2], %[temp4] \n\t" + "madd.s %[y1], %[y1], %[temp2], %[temp5] \n\t" + "swc1 %[y0], 0(%[Y1]) \n\t" + "1: \n\t" + "swc1 %[y1], 4(%[Y1]) \n\t" + + : [ff_table] "=&r" (ff_table), [y0] "=&f" (y0), [y1] "=&f" (y1), + [temp0] "=&r" (temp0), [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), + [temp3] "=&r" (temp3), [temp4] "=&f" (temp4), [temp5] "=&f" (temp5) + : [ff_sbr_noise_table] "r" (ff_sbr_noise_table), [noise] "r" (noise), + [Y1] "r" (Y1), [s_m1] "r" (s_m1), [q_filt1] "r" (q_filt1), + [phi_sign] "f" (phi_sign) + : "memory" + ); + phi_sign = -phi_sign; + } +} + +static void sbr_hf_apply_noise_2_mips(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max) +{ + int m; + float *ff_table; + float y0,y1, temp0, temp1, temp2, temp3, temp4, temp5; + + for (m = 0; m < m_max; m++) { + + float *Y1=&Y[m][0]; + const float *s_m1=&s_m[m]; + const float *q_filt1= &q_filt[m]; + + __asm__ volatile( + "lwc1 %[y0], 0(%[Y1]) \n\t" + "lwc1 %[temp1], 0(%[s_m1]) \n\t" + "addiu %[noise], %[noise], 1 \n\t" + "andi %[noise], %[noise], 0x1ff \n\t" + "sll %[temp0], %[noise], 3 \n\t" + PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0] \n\t" + "sub.s %[y0], %[y0], %[temp1] \n\t" + "mfc1 %[temp3], %[temp1] \n\t" + "bne %[temp3], $0, 1f \n\t" + "lwc1 %[y1], 4(%[Y1]) \n\t" + "lwc1 %[temp2], 0(%[q_filt1]) \n\t" + "lwc1 %[temp4], 0(%[ff_table]) \n\t" + "lwc1 %[temp5], 4(%[ff_table]) \n\t" + "madd.s %[y0], %[y0], %[temp2], %[temp4] \n\t" + "madd.s %[y1], %[y1], %[temp2], %[temp5] \n\t" + "swc1 %[y1], 4(%[Y1]) \n\t" + "1: \n\t" + "swc1 %[y0], 0(%[Y1]) \n\t" + + : [temp0]"=&r"(temp0), [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), + [y1]"=&f"(y1), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + 
[temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5) + : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise), + [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1) + : "memory" + ); + } +} + +static void sbr_hf_apply_noise_3_mips(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max) +{ + float phi_sign = 1 - 2 * (kx & 1); + int m; + + for (m = 0; m < m_max; m++) { + + float *Y1=&Y[m][0]; + float *ff_table; + float y0,y1, temp1, temp2, temp4, temp5; + int temp0, temp3; + const float *s_m1=&s_m[m]; + const float *q_filt1= &q_filt[m]; + + __asm__ volatile( + "lwc1 %[y1], 4(%[Y1]) \n\t" + "lwc1 %[temp1], 0(%[s_m1]) \n\t" + "addiu %[noise], %[noise], 1 \n\t" + "andi %[noise], %[noise], 0x1ff \n\t" + "sll %[temp0], %[noise], 3 \n\t" + PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0] \n\t" + "nmsub.s %[y1], %[y1], %[temp1], %[phi_sign] \n\t" + "mfc1 %[temp3], %[temp1] \n\t" + "bne %[temp3], $0, 1f \n\t" + "lwc1 %[y0], 0(%[Y1]) \n\t" + "lwc1 %[temp2], 0(%[q_filt1]) \n\t" + "lwc1 %[temp4], 0(%[ff_table]) \n\t" + "lwc1 %[temp5], 4(%[ff_table]) \n\t" + "madd.s %[y0], %[y0], %[temp2], %[temp4] \n\t" + "madd.s %[y1], %[y1], %[temp2], %[temp5] \n\t" + "swc1 %[y0], 0(%[Y1]) \n\t" + "1: \n\t" + "swc1 %[y1], 4(%[Y1]) \n\t" + + : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1), + [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), + [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5) + : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise), + [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1), + [phi_sign]"f"(phi_sign) + : "memory" + ); + phi_sign = -phi_sign; + } +} +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ + +void ff_sbrdsp_init_mips(SBRDSPContext *s) +{ +#if HAVE_INLINE_ASM + s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_mips; + s->qmf_post_shuffle = sbr_qmf_post_shuffle_mips; +#if HAVE_MIPSFPU +#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 + s->sum64x5 = sbr_sum64x5_mips; + s->sum_square = sbr_sum_square_mips; + s->qmf_deint_bfly = sbr_qmf_deint_bfly_mips; + s->autocorrelate = sbr_autocorrelate_mips; + s->hf_gen = sbr_hf_gen_mips; + s->hf_g_filt = sbr_hf_g_filt_mips; + + s->hf_apply_noise[0] = sbr_hf_apply_noise_0_mips; + s->hf_apply_noise[1] = sbr_hf_apply_noise_1_mips; + s->hf_apply_noise[2] = sbr_hf_apply_noise_2_mips; + s->hf_apply_noise[3] = sbr_hf_apply_noise_3_mips; +#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ +#endif /* HAVE_MIPSFPU */ +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c new file mode 100644 index 0000000000..628e13f7d2 --- /dev/null +++ b/libavcodec/mips/simple_idct_mmi.c @@ -0,0 +1,816 @@ +/* + * Loongson SIMD optimized simple idct + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "idctdsp_mips.h" +#include "constants.h" + +#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { + 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, + 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, + C4, C4, C4, C4, + C4, -C4, C4, -C4, + C2, C6, C2, C6, + C6, -C2, C6, -C2, + C1, C3, C1, C3, + C5, C7, C5, C7, + C3, -C7, C3, -C7, + -C1, -C5, -C1, -C5, + C5, -C1, C5, -C1, + C7, C3, C7, C3, + C7, -C5, C7, -C5, + C3, -C1, C3, -C1 +}; + +void ff_simple_idct_mmi(int16_t *block) +{ + DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + int16_t * const temp= (int16_t*)align_tmp; + + __asm__ volatile ( +#undef DC_COND_IDCT +#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "ldc1 $f8, %3 \n\t" \ + "and $f8, $f8, $f0 \n\t" \ + "or $f8, $f8, $f2 \n\t" \ + "or $f8, $f8, $f4 \n\t" \ + "or $f8, $f8, $f6 \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" \ + "li $11, " #shift " \n\t" \ + "mfc1 $10, $f8 \n\t" \ + "mtc1 $11, $f18 \n\t" \ + "beqz $10, 1f \n\t" \ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "ldc1 $f16, " #rarg " \n\t" \ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder " $f8, $f8, $f16 \n\t" \ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\ + "ldc1 $f16, " #rarg " \n\t" \ + "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder " $f0, $f0, $f16 \n\t" \ + "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\ + "ldc1 $f16, 64(%2) \n\t" \ + "paddw $f0, $f0, $f0 \n\t" \ + "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\ + "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f2 \n\t" /* A1 a1 */\ + "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\ + 
"psraw $f2, $f2, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "sdc1 $f14, " #dst " \n\t" \ + "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "sdc1 $f4, 24+" #dst " \n\t" \ + "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\ + "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "mov.d $f4, $f0 \n\t" /* A2 a2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\ + "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f0, $f0, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "sdc1 $f4, 8+" #dst " \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "sdc1 $f8, 16+" #dst " \n\t" \ + "b 2f \n\t" \ + "1: \n\t" \ + "li $10, 16 \n\t" \ + "mtc1 $10, $f16 \n\t" \ + "psllw $f0, $f0, $f16 \n\t" \ + "ldc1 $f16, %4 \n\t" \ + "paddw $f0, $f0, $f16 \n\t" \ + "li $10, 13 \n\t" \ + "mtc1 $10, $f16 \n\t" \ + "psraw $f0, $f0, $f16 \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" \ + "sdc1 $f0, " #dst " \n\t" \ + "sdc1 $f0, 8+" #dst " \n\t" \ + "sdc1 $f0, 16+" #dst " \n\t" \ + "sdc1 $f0, 24+" #dst " \n\t" \ + "2: \n\t" + +#undef Z_COND_IDCT +#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "mov.d $f8, $f0 \n\t" \ + "or $f8, $f8, $f2 \n\t" \ + "or $f8, $f8, $f4 \n\t" \ + "or $f8, $f8, $f6 \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" \ + "mfc1 $10, $f8 \n\t" \ + "beqz $10, " #bt " \n\t" \ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "ldc1 $f16, " #rarg " \n\t" \ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder " $f8, $f8, $f16 \n\t" \ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\ + "ldc1 $f16, " #rarg " \n\t" \ + "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder " $f0, $f0, $f16 \n\t" \ + "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f0 \n\t" \ + "ldc1 $f16, 64(%2) \n\t" \ + "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\ + "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 
-C5r7-C1r5 */\ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "li $10, " #shift " \n\t" \ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "mtc1 $10, $f18 \n\t" \ + "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f2 \n\t" /* A1 a1 */\ + "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f2, $f2, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "sdc1 $f14, " #dst " \n\t" \ + "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "sdc1 $f4, 24+" #dst " \n\t" \ + "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\ + "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "mov.d $f4, $f0 \n\t" /* A2 a2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\ + "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f0, $f0, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "sdc1 $f4, 8+" #dst " \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "sdc1 $f8, 16+" #dst " \n\t" \ + + //IDCT( src0, src4, src1, src5, dst, rounder, shift) + DC_COND_IDCT(0(%0), 8(%0), 16(%0), 24(%0), 0(%1), paddw,8(%2), 11) + Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddw,(%2), 11, 4f) + Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddw,(%2), 11, 2f) + Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1), paddw,(%2), 11, 1f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\ + "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\ + "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\ + "ldc1 $f16, 64(%2) \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "li $10, " #shift " \n\t" \ + "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\ + "ldc1 $f2, 72(%2) \n\t" /* -C5 
-C1 -C5 -C1 */\ + "mtc1 $10, $f18 \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f14, " #dst " \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f4, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\ + "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "mov.d $f4, $f10 \n\t" /* A2 a2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\ + "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\ + "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f4, 32+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "swc1 $f8, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "4: \n\t" + Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f) + Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\ + "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\ + "ldc1 $f2, 56(%2) \n\t" 
/* C7 C5 C7 C5 */\ + "li $10, " #shift " \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\ + "mtc1 $10, $f18 \n\t" \ + "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\ + "psraw $f2, $f2, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f2, " #dst " \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f4, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "mov.d $f4, $f10 \n\t" /* A2 a2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f2, $f12 \n\t" /* A3 a3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f2, $f2, $f18 \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\ + "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f4, 32+" #dst " \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "swc1 $f2, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "6: \n\t" + Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\ + "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\ + "li $10, " #shift " \n\t" \ + "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\ + "mtc1 $10, $f18 \n\t" \ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\ + "psraw $f2, $f2, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f2, " #dst " \n\t" \ + "packsswh $f0, 
$f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f4, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "mov.d $f4, $f10 \n\t" /* A2 a2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f2, $f12 \n\t" /* A3 a3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f2, $f2, $f18 \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\ + "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f4, 32+" #dst " \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "swc1 $f2, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "2: \n\t" + Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\ + "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "ldc1 $f16, 64(%2) \n\t" \ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\ + "ldc1 $f2, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\ + "li $10, " #shift " \n\t" \ + "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "mtc1 $10, $f18 \n\t" \ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f4, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f14, " #dst " \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f4, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "ldc1 $f14, 88(%2) 
\n\t" /* C3 C7 C3 C7 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "mov.d $f4, $f10 \n\t" /* A2 a2 */\ + "ldc1 $f16, 104(%2) \n\t" \ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\ + "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f4, $f4, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\ + "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\ + "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f4, 32+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "swc1 $f8, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "3: \n\t" + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f6, 64(%2) \n\t" \ + "pmaddhw $f6, $f6, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "li $10, " #shift " \n\t" \ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "mtc1 $10, $f18 \n\t" \ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f2, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f6 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f2, $f2, $f6 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f2, $f2, $f18 \n\t" \ + "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f14, " #dst " \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f2, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "mov.d $f2, $f10 \n\t" /* A2 a2 */\ + "paddw $f2, $f2, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f2, $f2, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f2, $f2, $f2 \n\t" /* A2+B2 a2+b2 */\ + "packsswh $f12, $f12, 
$f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f2, 32+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "swc1 $f8, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "5: \n\t" + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\ + "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\ + "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f6, 8+" #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "ldc1 $f16, 40(%2) \n\t" \ + "pmaddhw $f14, $f14, $f6 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "pmaddhw $f6, $f6, $f16 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "paddw $f14, $f14, $f2 \n\t" /* A0 a0 */\ + "paddw $f2, $f2, $f2 \n\t" /* 2C0 2c0 */\ + "psubw $f2, $f2, $f14 \n\t" /* A3 a3 */\ + "li $10, " #shift " \n\t" \ + "paddw $f6, $f6, $f4 \n\t" /* A1 a1 */\ + "mtc1 $10, $f18 \n\t" \ + "paddw $f4, $f4, $f4 \n\t" /* 2C1 2c1 */\ + "psubw $f4, $f4, $f6 \n\t" /* A2 a2 */\ + "psraw $f8, $f8, $f18 \n\t" \ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f6, $f6, $f18 \n\t" \ + "packsswh $f8, $f8, $f14 \n\t" /* A0 a0 */\ + "sdc1 $f8, " #dst " \n\t" \ + "psraw $f0, $f0, $f18 \n\t" \ + "packsswh $f0, $f0, $f6 \n\t" /* A1 a1 */\ + "sdc1 $f0, 16+" #dst " \n\t" \ + "sdc1 $f0, 96+" #dst " \n\t" \ + "sdc1 $f8, 112+" #dst " \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "psraw $f12, $f12, $f18 \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f10, $f10, $f4 \n\t" /* A2-B2 a2-b2 */\ + "sdc1 $f10, 32+" #dst " \n\t" \ + "psraw $f2, $f2, $f18 \n\t" \ + "packsswh $f12, $f12, $f2 \n\t" /* A3+B3 a3+b3 */\ + "sdc1 $f12, 48+" #dst " \n\t" \ + "sdc1 $f12, 64+" #dst " \n\t" \ + "sdc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "1: \n\t" + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\ + "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "li $10, " #shift " \n\t" \ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f10, 24(%2) 
\n\t" /* -C4 C4 -C4 C4 */\ + "mtc1 $10, $f18 \n\t" \ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\ + "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\ + "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\ + "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\ + "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\ + "ldc1 $f2, 64(%2) \n\t" \ + "pmaddhw $f2, $f2, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\ + "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\ + "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\ + "psraw $f14, $f14, $f18 \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "mov.d $f6, $f0 \n\t" /* A1 a1 */\ + "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\ + "psubw $f6, $f6, $f2 \n\t" /* A1-B1 a1-b1 */\ + "psraw $f0, $f0, $f18 \n\t" \ + "psraw $f6, $f6, $f18 \n\t" \ + "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\ + "swc1 $f14, " #dst " \n\t" \ + "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\ + "swc1 $f0, 16+" #dst " \n\t" \ + "packsswh $f6, $f6, $f6 \n\t" /* A1-B1 a1-b1 */\ + "swc1 $f6, 96+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\ + "swc1 $f8, 112+" #dst " \n\t" \ + "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\ + "ldc1 $f16, 96(%2) \n\t" \ + "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "mov.d $f6, $f10 \n\t" /* A2 a2 */\ + "paddw $f6, $f6, $f8 \n\t" /* A2+B2 a2+b2 */\ + "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\ + "psraw $f6, $f6, $f18 \n\t" \ + "psraw $f10, $f10, $f18 \n\t" \ + "mov.d $f8, $f12 \n\t" /* A3 a3 */\ + "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\ + "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\ + "psraw $f12, $f12, $f18 \n\t" \ + "packsswh $f6, $f6, $f6 \n\t" /* A2+B2 a2+b2 */\ + "swc1 $f6, 32+" #dst " \n\t" \ + "psraw $f8, $f8, $f18 \n\t" \ + "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\ + "swc1 $f12, 48+" #dst " \n\t" \ + "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\ + "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\ + "swc1 $f8, 64+" #dst " \n\t" \ + "swc1 $f10, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "b 9f \n\t" + + "# .p2align 4 \n\t" + "7: \n\t" + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "li $10, " #shift " \n\t" \ + "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "mtc1 $10, $f18 \n\t" \ + "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "psraw $f8, $f8, $f18 \n\t" \ + "psraw $f0, $f0, $f18 \n\t" \ + "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 */\ + "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\ + "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 */\ + 
"psraw $f2, $f2, $f18 \n\t" \ + "packsswh $f8, $f8, $f2 \n\t" /* A0 a0 */\ + "sdc1 $f8, " #dst " \n\t" \ + "psraw $f4, $f4, $f18 \n\t" \ + "packsswh $f0, $f0, $f4 \n\t" /* A1 a1 */\ + "sdc1 $f0, 16+" #dst " \n\t" \ + "sdc1 $f0, 96+" #dst " \n\t" \ + "sdc1 $f8, 112+" #dst " \n\t" \ + "sdc1 $f0, 32+" #dst " \n\t" \ + "sdc1 $f8, 48+" #dst " \n\t" \ + "sdc1 $f8, 64+" #dst " \n\t" \ + "sdc1 $f0, 80+" #dst " \n\t" + + //IDCT( src0, src4, src1, src5, dst, shift) + IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) + IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) + + "9: \n\t" + ::"r"(block),"r"(temp),"r"(coeffs),"m"(ff_wm1010),"m"(ff_d40000) + : "$10","$11" + ); +} diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c new file mode 100644 index 0000000000..bd8b31012f --- /dev/null +++ b/libavcodec/mips/simple_idct_msa.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "idctdsp_mips.h" + +static void simple_idct_msa(int16_t *block) +{ + int32_t const_val; + v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 w1, w3, w5, w7; + v8i16 const0, const1, const2, const3, const4, const5, const6, const7; + v4i32 temp0_r, temp1_r, temp2_r, temp3_r; + v4i32 temp0_l, temp1_l, temp2_l, temp3_l; + v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; + v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; + v4i32 w2, w4, w6; + v8i16 select_vec, temp; + v8i16 zero = { 0 }; + v4i32 const_val0 = __msa_ldi_w(1); + v4i32 const_val1 = __msa_ldi_w(1); + + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + const_val0 <<= 10; + const_val = 16383 * ((1 << 19) / 16383); + const_val1 = __msa_insert_w(const_val0, 0, const_val); + const_val1 = __msa_splati_w(const_val1, 0); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; + select_vec = __msa_clti_u_h((v8u16) select_vec, 1); + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + temp = in0 << 3; + w2 = (v4i32) __msa_splati_h(weights, 2); + w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); + w4 = (v4i32) __msa_splati_h(weights, 4); + w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); + w6 = (v4i32) __msa_splati_h(weights, 6); + w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); + MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, + temp1_r, temp1_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, 
a2_r, a3_l, a3_r); + UNPCK_SH_SW(in4, temp0_r, temp0_l); + UNPCK_SH_SW(in6, temp3_r, temp3_l); + MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); + MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, + temp2_r, temp2_l, temp1_r, temp1_l); + ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); + SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l, + a1_r, a1_l, a2_r, a2_l); + ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l, + a3_r, a3_l, a0_r, a0_l); + SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); + ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); + SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); + ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); + ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, + const0, const1, const2, const3); + ILVR_H2_SH(w5, w7, w7, w3, const4, const6); + const5 = __msa_ilvod_h(-w1, -w5); + const7 = __msa_ilvod_h(w3, -w1); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, + temp2_l, temp2_r, temp3_l, temp3_r, + temp0_r, temp1_r, temp2_r, temp3_r); + in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, + (v16u8) select_vec); + in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, + (v16u8) select_vec); + in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, + (v16u8) select_vec); + in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, + (v16u8) select_vec); + SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); + in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); + in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); + in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + w2 = (v4i32) __msa_splati_h(weights, 2); + w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); + w4 = (v4i32) __msa_splati_h(weights, 4); + w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); + w6 = (v4i32) __msa_splati_h(weights, 6); + w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); + MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, + temp1_r, temp1_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); + UNPCK_SH_SW(in4, temp0_r, temp0_l); + UNPCK_SH_SW(in6, temp3_r, temp3_l); + MUL2(temp0_r, w4, temp0_l, w4, 
temp0_r, temp0_l); + MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, + temp2_r, temp2_l, temp1_r, temp1_l); + ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); + SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l, + a1_r, a1_l, a2_r, a2_l); + ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l, + a3_r, a3_l, a0_r, a0_l); + SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); + ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); + SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); + ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, + const0, const1, const2, const3); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); + ILVR_H2_SH(w5, w7, w7, w3, const4, const6); + const5 = __msa_ilvod_h(-w1, -w5); + const7 = __msa_ilvod_h(w3, -w1); + DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, + temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); + SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r, + block, 8); +} + +static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, + int16_t *block) +{ + int32_t const_val; + uint64_t tmp0, tmp1, tmp2, tmp3; + v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 w1, w3, w5, w7; + v8i16 const0, const1, const2, const3, const4, const5, const6, const7; + v4i32 temp0_r, temp1_r, temp2_r, temp3_r; + v4i32 temp0_l, temp1_l, temp2_l, temp3_l; + v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; + v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; + v4i32 w2, w4, w6; + v8i16 select_vec, temp; + v8i16 zero = { 0 }; + v4i32 const_val0 = __msa_ldi_w(1); + v4i32 const_val1 = __msa_ldi_w(1); + + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + const_val0 <<= 10; + const_val = 16383 * ((1 << 19) / 16383); + const_val1 = __msa_insert_w(const_val0, 0, const_val); + const_val1 = __msa_splati_w(const_val1, 0); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; + select_vec = __msa_clti_u_h((v8u16) select_vec, 1); + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + temp = in0 << 3; + w2 = (v4i32) __msa_splati_h(weights, 2); + w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); + w4 = (v4i32) __msa_splati_h(weights, 4); + w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); + w6 = (v4i32) __msa_splati_h(weights, 6); + w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val0, a0_l, 
const_val0, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); + MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); + UNPCK_SH_SW(in4, temp0_r, temp0_l); + UNPCK_SH_SW(in6, temp3_r, temp3_l); + MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); + MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); + ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); + SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); + SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); + ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); + ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); + SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); + ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); + SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); + ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); + ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, + const0, const1, const2, const3); + ILVR_H2_SH(w5, w7, w7, w3, const4, const6); + const5 = __msa_ilvod_h(-w1, -w5); + const7 = __msa_ilvod_h(w3, -w1); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, + temp2_l, temp2_r, temp3_l, temp3_r, + temp0_r, temp1_r, temp2_r, temp3_r); + in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, + (v16u8) select_vec); + in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, + (v16u8) select_vec); + in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, + (v16u8) select_vec); + in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, + (v16u8) select_vec); + SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); + in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); + in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); + in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + w2 = (v4i32) __msa_splati_h(weights, 2); + w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); + w4 = (v4i32) __msa_splati_h(weights, 4); + w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); + w6 = (v4i32) __msa_splati_h(weights, 6); + w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); + MUL2(w6, temp3_r, w6, 
temp3_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); + UNPCK_SH_SW(in4, temp0_r, temp0_l); + UNPCK_SH_SW(in6, temp3_r, temp3_l); + MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); + MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); + ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); + SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); + SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); + ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); + ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); + SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); + ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); + SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); + ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, + const0, const1, const2, const3); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); + ILVR_H2_SH(w5, w7, w7, w3, const4, const6); + const5 = __msa_ilvod_h(-w1, -w5); + const7 = __msa_ilvod_h(w3, -w1); + DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); + SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, + temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); + temp1_r = (v4i32) CLIP_SH_0_255(temp1_r); + temp2_r = (v4i32) CLIP_SH_0_255(temp2_r); + temp3_r = (v4i32) CLIP_SH_0_255(temp3_r); + PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r, + temp2_r, temp2_r, temp3_r, temp3_r, + temp0_r, temp1_r, temp2_r, temp3_r); + tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1); + tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1); + tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1); + tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1); + SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += 4 * dst_stride; + a0_r = (v4i32) CLIP_SH_0_255(a0_r); + a1_r = (v4i32) CLIP_SH_0_255(a1_r); + a2_r = (v4i32) CLIP_SH_0_255(a2_r); + a3_r = (v4i32) CLIP_SH_0_255(a3_r); + PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r, + a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r); + tmp3 = __msa_copy_u_d((v2i64) a0_r, 1); + tmp2 = __msa_copy_u_d((v2i64) a1_r, 1); + tmp1 = __msa_copy_u_d((v2i64) a2_r, 1); + tmp0 = __msa_copy_u_d((v2i64) a3_r, 1); + SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += 4 * dst_stride; +} + +static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, + int16_t *block) +{ + int32_t const_val; + uint64_t tmp0, tmp1, tmp2, tmp3; + v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 w1, w3, w5, w7; + v8i16 const0, const1, const2, const3, const4, 
const5, const6, const7; + v4i32 temp0_r, temp1_r, temp2_r, temp3_r; + v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r; + v4i32 temp0_l, temp1_l, temp2_l, temp3_l; + v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l; + v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; + v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; + v4i32 w2, w4, w6; + v8i16 select_vec, temp; + v8i16 zero = { 0 }; + v4i32 const_val0 = __msa_ldi_w(1); + v4i32 const_val1 = __msa_ldi_w(1); + + const_val0 <<= 10; + const_val = 16383 * ((1 << 19) / 16383); + const_val1 = __msa_insert_w(const_val0, 0, const_val); + const_val1 = __msa_splati_w(const_val1, 0); + LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; + select_vec = __msa_clti_u_h((v8u16) select_vec, 1); + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + UNPCK_SH_SW(in4, temp4_r, temp4_l); + UNPCK_SH_SW(in6, temp7_r, temp7_l); + ILVRL_H2_SW(in5, in7, temp8_r, temp8_l); + temp = in0 << 3; + SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); + ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, + const0, const1, const2, const3); + ILVR_H2_SH(w5, w7, w7, w3, const4, const6); + const5 = __msa_ilvod_h(-w1, -w5); + const7 = __msa_ilvod_h(w3, -w1); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + w2 = (v4i32) __msa_splati_h(weights, 2); + w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); + w4 = (v4i32) __msa_splati_h(weights, 4); + w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); + w6 = (v4i32) __msa_splati_h(weights, 6); + w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); + MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); + MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l); + MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l); + MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l); + ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l); + SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l); + SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l); + ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l); + ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l); + SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l); + ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l); + SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, + temp2_l, temp2_r, temp3_l, temp3_r, + temp0_r, temp1_r, temp2_r, temp3_r); + in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, + (v16u8) select_vec); + 
in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, + (v16u8) select_vec); + in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, + (v16u8) select_vec); + in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, + (v16u8) select_vec); + SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); + in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); + in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); + in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + UNPCK_SH_SW(in0, a0_r, a0_l); + UNPCK_SH_SW(in2, temp3_r, temp3_l); + MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); + ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); + MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); + BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, + temp2_l, temp2_r, temp1_l, temp1_r, + a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); + UNPCK_SH_SW(in4, temp0_r, temp0_l); + UNPCK_SH_SW(in6, temp3_r, temp3_l); + MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); + MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); + MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); + ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); + SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); + SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); + ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); + ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); + SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); + ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); + SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); + ILVRL_H2_SW(in1, in3, b3_r, b3_l); + ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); + DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, + b0_r, b1_r, b2_r, b3_r); + DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, + b0_l, b1_l, b2_l, b3_l); + DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, + const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); + DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, + const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); + BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, + b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, + temp0_r, temp0_l, temp1_r, temp1_l, + temp2_r, temp2_l, temp3_r, temp3_l, + a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); + SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); + SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); + LD_SH4(dst, dst_stride, in0, in1, in2, in3); + PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, + temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); + ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3, + temp0_l, temp1_l, temp2_l, temp3_l); + temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l)); + temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l)); + temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l)); + temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l)); + temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); + temp1_r = (v4i32) CLIP_SH_0_255(temp1_r); + temp2_r = (v4i32) CLIP_SH_0_255(temp2_r); + temp3_r = (v4i32) CLIP_SH_0_255(temp3_r); + PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r, + temp2_r, temp2_r, temp3_r, temp3_r, + temp0_r, temp1_r, temp2_r, temp3_r); + tmp0 = 
__msa_copy_u_d((v2i64) temp0_r, 1); + tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1); + tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1); + tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1); + SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + + SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); + SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); + LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7); + PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, + a0_r, a1_r, a2_r, a3_r); + ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7, + a3_l, a2_l, a1_l, a0_l); + a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l)); + a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l)); + a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l)); + a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l)); + a3_r = (v4i32) CLIP_SH_0_255(a3_r); + a2_r = (v4i32) CLIP_SH_0_255(a2_r); + a1_r = (v4i32) CLIP_SH_0_255(a1_r); + a0_r = (v4i32) CLIP_SH_0_255(a0_r); + PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r, + a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r); + tmp0 = __msa_copy_u_d((v2i64) a3_r, 1); + tmp1 = __msa_copy_u_d((v2i64) a2_r, 1); + tmp2 = __msa_copy_u_d((v2i64) a1_r, 1); + tmp3 = __msa_copy_u_d((v2i64) a0_r, 1); + SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); +} + +void ff_simple_idct_msa(int16_t *block) +{ + simple_idct_msa(block); +} + +void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block) +{ + simple_idct_put_msa(dst, dst_stride, block); +} + +void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block) +{ + simple_idct_add_msa(dst, dst_stride, block); +} diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c new file mode 100644 index 0000000000..11ac9ff83e --- /dev/null +++ b/libavcodec/mips/vp8_idct_msa.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> +#include "libavcodec/vp8dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp8dsp_mips.h" + +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2); \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = ((in1) * sinpi8_sqrt2_m) >> 16; \ + c_tmp2_m = in3 + (((in3) * const_cospi8sqrt2minus1_m) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = (in1) + (((in1) * const_cospi8sqrt2minus1_m) >> 16); \ + d_tmp2_m = ((in3) * sinpi8_sqrt2_m) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} + +void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride) +{ + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + v16i8 zero = { 0 }; + v16i8 pred0, pred1, pred2, pred3, dest0, dest1; + v16i8 mask = { 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 }; + + /* load short vector elements of 4x4 block */ + LD_SH2(input, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + /* transpose the block */ + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + /* transpose the block */ + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(dst, stride, pred0, pred1, pred2, pred3); + ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + res0 = CLIP_SW_0_255(res0); + res1 = CLIP_SW_0_255(res1); + res2 = CLIP_SW_0_255(res2); + res3 = CLIP_SW_0_255(res3); + VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); + ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride); + + memset(input, 0, 4 * 4 * sizeof(*input)); +} + +void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride) +{ + v8i16 vec; + v8i16 res0, res1, res2, res3; + v16i8 zero = { 0 }; + v16i8 pred0, pred1, pred2, pred3, dest0, dest1; + v16i8 mask = { 0, 2, 4, 6, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0 }; + + vec = __msa_fill_h(in_dc[0]); + vec = __msa_srari_h(vec, 3); + LD_SB4(dst, stride, pred0, pred1, pred2, pred3); + ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + CLIP_SH4_0_255(res0, res1, res2, res3); + VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); + ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride); + + in_dc[0] = 0; +} + +void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t input[16]) +{ + int16_t *mb_dq_coeff = &block[0][0][0]; + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, a1, b1, c1, d1; + v4i32 hz0, hz1, hz2, hz3, vt0, 
vt1, vt2, vt3; + + /* load short vector elements of 4x4 block */ + LD_SH2(input, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1); + BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2); + /* transpose the block */ + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1); + BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2); + ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3); + SRA_4V(vt0, vt1, vt2, vt3, 3); + mb_dq_coeff[0] = __msa_copy_s_h((v8i16) vt0, 0); + mb_dq_coeff[16] = __msa_copy_s_h((v8i16) vt1, 0); + mb_dq_coeff[32] = __msa_copy_s_h((v8i16) vt2, 0); + mb_dq_coeff[48] = __msa_copy_s_h((v8i16) vt3, 0); + mb_dq_coeff[64] = __msa_copy_s_h((v8i16) vt0, 2); + mb_dq_coeff[80] = __msa_copy_s_h((v8i16) vt1, 2); + mb_dq_coeff[96] = __msa_copy_s_h((v8i16) vt2, 2); + mb_dq_coeff[112] = __msa_copy_s_h((v8i16) vt3, 2); + mb_dq_coeff[128] = __msa_copy_s_h((v8i16) vt0, 4); + mb_dq_coeff[144] = __msa_copy_s_h((v8i16) vt1, 4); + mb_dq_coeff[160] = __msa_copy_s_h((v8i16) vt2, 4); + mb_dq_coeff[176] = __msa_copy_s_h((v8i16) vt3, 4); + mb_dq_coeff[192] = __msa_copy_s_h((v8i16) vt0, 6); + mb_dq_coeff[208] = __msa_copy_s_h((v8i16) vt1, 6); + mb_dq_coeff[224] = __msa_copy_s_h((v8i16) vt2, 6); + mb_dq_coeff[240] = __msa_copy_s_h((v8i16) vt3, 6); + + memset(input, 0, 4 * 4 * sizeof(int16_t)); +} + +void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride) +{ + ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride); + ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride); + ff_vp8_idct_dc_add_msa(dst + 8, &block[2][0], stride); + ff_vp8_idct_dc_add_msa(dst + 12, &block[3][0], stride); +} + +void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride) +{ + ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride); + ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride); + ff_vp8_idct_dc_add_msa(dst + stride * 4, &block[2][0], stride); + ff_vp8_idct_dc_add_msa(dst + stride * 4 + 4, &block[3][0], stride); +} diff --git a/libavcodec/mips/vp8_lpf_msa.c b/libavcodec/mips/vp8_lpf_msa.c new file mode 100644 index 0000000000..359096174a --- /dev/null +++ b/libavcodec/mips/vp8_lpf_msa.c @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp8dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp8dsp_mips.h" + +#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \ +{ \ + v16u8 p1_a_sub_q1, p0_a_sub_q0; \ + \ + p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \ + p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \ + p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1); \ + p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \ + mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \ + mask = ((v16u8) mask <= b_limit); \ +} + +#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \ + mask_in, hev_in) \ +{ \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80); \ + p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80); \ + q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80); \ + q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8) hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \ + filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \ + filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ + filt = filt & (v16i8) mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \ + filt = filt & (v16i8) hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80); \ +} + +#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ +{ \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ + q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ + q0_sub_p0_r *= cnst3h; \ + filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16) 
__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ + q0_sub_p0_l *= cnst3h; \ + filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ + filt = filt & (v16i8) (mask); \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8) q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8) p0_m, 0x80); \ +} + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ +{ \ + v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \ + v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \ + v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \ + \ + cnst3h = __msa_ldi_h(3); \ + \ + p2_m = (v16i8) __msa_xori_b(p2, 0x80); \ + p1_m = (v16i8) __msa_xori_b(p1, 0x80); \ + p0_m = (v16i8) __msa_xori_b(p0, 0x80); \ + q0_m = (v16i8) __msa_xori_b(q0, 0x80); \ + q1_m = (v16i8) __msa_xori_b(q1, 0x80); \ + q2_m = (v16i8) __msa_xori_b(q2, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = q0_m - p0_m; \ + q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + /* right part */ \ + q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ + q0_sub_p0_r *= cnst3h; \ + filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ + filt_r = filt_r + q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* left part */ \ + q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ + q0_sub_p0_l *= cnst3h; \ + filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ + filt_l = filt_l + q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ + filt = filt & (v16i8) mask; \ + filt2 = filt & (v16i8) hev; \ + \ + /* filt_val &= ~hev */ \ + hev = __msa_xori_b(hev, 0xff); \ + filt = filt & (v16i8) hev; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt2, cnst4b); \ + filt1 >>= 3; \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt2, cnst3b); \ + filt2 >>= 3; \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + \ + filt_sign = __msa_clti_s_b(filt, 0); \ + ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ + \ + cnst27h = __msa_ldi_h(27); \ + cnst63h = __msa_ldi_h(63); \ + \ + /* right part */ \ + u_r = filt_r * cnst27h; \ + u_r += cnst63h; \ + u_r >>= 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + /* left part */ \ + u_l = filt_l * cnst27h; \ + u_l += cnst63h; \ + u_l >>= 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + /* combine left and right part */ \ + u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ + q0_m = __msa_subs_s_b(q0_m, u); \ + q0 = __msa_xori_b((v16u8) q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, u); \ + p0 = __msa_xori_b((v16u8) p0_m, 0x80); \ + cnst18h = __msa_ldi_h(18); \ + u_r = filt_r * cnst18h; \ + u_r += cnst63h; \ + u_r >>= 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + \ + /* left part */ \ + u_l = filt_l * cnst18h; \ + u_l += cnst63h; \ + u_l >>= 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + /* combine left and right part */ \ + u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ + q1_m = __msa_subs_s_b(q1_m, u); \ + q1 = __msa_xori_b((v16u8) q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, u); \ + p1 = __msa_xori_b((v16u8) 
p1_m, 0x80); \ + u_r = filt_r << 3; \ + u_r += filt_r + cnst63h; \ + u_r >>= 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + \ + /* left part */ \ + u_l = filt_l << 3; \ + u_l += filt_l + cnst63h; \ + u_l >>= 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + /* combine left and right part */ \ + u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \ + q2_m = __msa_subs_s_b(q2_m, u); \ + q2 = __msa_xori_b((v16u8) q2_m, 0x80); \ + p2_m = __msa_adds_s_b(p2_m, u); \ + p2 = __msa_xori_b((v16u8) p2_m, 0x80); \ +} + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ + q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, \ + hev_out, mask_out, flat_out) \ +{ \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \ + p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \ + p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \ + q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \ + q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \ + q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \ + p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \ + p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = (thresh_in) < (v16u8) flat_out; \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = (b_limit_in) < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + mask_out = (limit_in) < (v16u8) mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ +} + +#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \ +{ \ + uint16_t tmp0_h; \ + uint32_t tmp0_w; \ + \ + tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx); \ + tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx); \ + SW(tmp0_w, pdst); \ + SH(tmp0_h, pdst + stride); \ +} + +void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in, + int limit_in, int thresh_in) +{ + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + + b_limit = (v16u8) __msa_fill_b(b_limit_in); + limit = (v16u8) __msa_fill_b(limit_in); + thresh = (v16u8) __msa_fill_b(thresh_in); + /* load vector elements */ + temp_src = src - (pitch << 2); + LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + /* store vector elements */ + temp_src = src - 3 * pitch; + ST_UB4(p2, p1, p0, q0, temp_src, pitch); + temp_src += (4 * pitch); + ST_UB2(q1, q2, temp_src, pitch); +} + +void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v, + ptrdiff_t pitch, int b_limit_in, int limit_in, + int thresh_in) +{ + uint8_t *temp_src; + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + b_limit = (v16u8) __msa_fill_b(b_limit_in); + limit = (v16u8) __msa_fill_b(limit_in); + 
thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    /* right 8 elements of p3 are u pixels and left 8 elements of p3 are v pixels */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 0);
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    q2_d = __msa_copy_u_d((v2i64) q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 1);
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    q2_d = __msa_copy_u_d((v2i64) q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
+void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+
temp_src += pitch; + VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4); +} + +void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v, + ptrdiff_t pitch, int b_limit_in, int limit_in, + int thresh_in) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + b_limit = (v16u8) __msa_fill_b(b_limit_in); + limit = (v16u8) __msa_fill_b(limit_in); + thresh = (v16u8) __msa_fill_b(thresh_in); + + LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src_v - 4, pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4); + ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7); + ILVRL_B2_SH(q2, q1, tmp2, tmp5); + + src_u -= 3; + VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4); + + src_v -= 3; + VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4); +} + +void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch, + int b_limit_ptr) +{ + v16u8 p1, p0, q1, q0; + v16u8 mask, b_limit; + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + /* load vector elements */ + LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1); + VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); + ST_UB2(p0, q0, (src - pitch), pitch); +} + +void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch, + int b_limit_ptr) +{ + uint8_t *temp_src; + v16u8 p1, p0, q1, q0; + v16u8 mask, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1; + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + temp_src = src - 2; + LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p1, p0, q0, q1); + VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + 
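    /* the mask passes only lanes where |p0 - q0| * 2 + |p1 - q1| / 2 <= b_limit; only those lanes are adjusted below */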
VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); + ILVRL_B2_SH(q0, p0, tmp1, tmp0); + + src -= 1; + ST2x4_UB(tmp1, 0, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp1, 4, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp0, 0, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp0, 4, src, pitch); + src += 4 * pitch; +} + +void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v, + ptrdiff_t pitch, int b_limit_in, + int limit_in, int thresh_in) +{ + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = (v16u8) __msa_fill_b(thresh_in); + limit = (v16u8) __msa_fill_b(limit_in); + b_limit = (v16u8) __msa_fill_b(b_limit_in); + + src_u = src_u - (pitch << 2); + LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u); + src_u += (5 * pitch); + src_v = src_v - (pitch << 2); + LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v); + src_v += (5 * pitch); + + /* right 8 element of p3 are u pixel and + left 8 element of p3 are v pixel */ + ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0); + ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + p1_d = __msa_copy_u_d((v2i64) p1, 0); + p0_d = __msa_copy_u_d((v2i64) p0, 0); + q0_d = __msa_copy_u_d((v2i64) q0, 0); + q1_d = __msa_copy_u_d((v2i64) q1, 0); + SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch)); + + p1_d = __msa_copy_u_d((v2i64) p1, 1); + p0_d = __msa_copy_u_d((v2i64) p0, 1); + q0_d = __msa_copy_u_d((v2i64) q0, 1); + q1_d = __msa_copy_u_d((v2i64) q1, 1); + SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch)); +} + +void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v, + ptrdiff_t pitch, int b_limit_in, + int limit_in, int thresh_in) +{ + uint8_t *temp_src_u, *temp_src_v; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = (v16u8) __msa_fill_b(thresh_in); + limit = (v16u8) __msa_fill_b(limit_in); + b_limit = (v16u8) __msa_fill_b(b_limit_in); + + LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src_v - 4, pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3); + tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1); + tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0); + ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5); + + temp_src_u = src_u - 2; + ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch); + temp_src_u += 4 * pitch; + ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch); + + temp_src_v = src_v - 2; + ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch); + temp_src_v += 4 * pitch; + ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch); +} + +void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch, + int32_t e, 
int32_t i, int32_t h) +{ + v16u8 mask, hev, flat; + v16u8 thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + thresh = (v16u8) __msa_fill_b(h); + b_limit = (v16u8) __msa_fill_b(e); + limit = (v16u8) __msa_fill_b(i); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch, + int32_t e, int32_t i, int32_t h) +{ + v16u8 mask, hev, flat; + v16u8 thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(h); + b_limit = (v16u8) __msa_fill_b(e); + limit = (v16u8) __msa_fill_b(i); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/libavcodec/mips/vp8_mc_msa.c b/libavcodec/mips/vp8_mc_msa.c new file mode 100644 index 0000000000..2bf0abd8c9 --- /dev/null +++ b/libavcodec/mips/vp8_mc_msa.c @@ -0,0 +1,2332 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp8dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp8dsp_mips.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static const int8_t subpel_filters_msa[7][8] = { + {-6, 123, 12, -1, 0, 0, 0, 0}, + {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + {-9, 93, 50, -6, 0, 0, 0, 0}, + {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + {-6, 50, 93, -9, 0, 0, 0, 0}, + {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + {-1, 12, 123, -6, 0, 0, 0, 0}, +}; + +static const int8_t bilinear_filters_msa[7][2] = { + {112, 16}, + {96, 32}, + {80, 48}, + {64, 64}, + {48, 80}, + {32, 96}, + {16, 112} +}; + +#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ + filt_h0, filt_h1, filt_h2) \ +( { \ + v16i8 vec0_m, vec1_m, vec2_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ + vec0_m, vec1_m, vec2_m); \ + hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ + filt_h0, filt_h1, filt_h2); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, 7); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ +} ) + +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, \ + filt0, filt1, filt2, \ + out0, out1) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ +} + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, \ + filt0, filt1, filt2, \ + out0, out1, out2, out3) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + out0, out1, out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ + out0, out1, out2, out3); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ + out0, out1, out2, out3); \ +} + +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ +( { \ + v8i16 tmp0; \ + \ + tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ + \ + tmp0; \ +} ) + +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +( { \ + v16i8 vec0_m, vec1_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, 
mask0, mask1, vec0_m, vec1_m); \ + hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, 7); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ +} ) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, filt0, filt1, \ + out0, out1) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +} + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, filt0, filt1, \ + out0, out1, out2, out3) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + out0, out1, out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ + out0, out1, out2, out3); \ +} + +static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 2; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1); + SRARI_H2_SH(out0, out1, 7); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 2; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = subpel_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_6t_4x4_msa(src, 
src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + + src -= 2; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 2; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + src += (4 * src_stride); + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SRARI_H4_SH(out4, out5, out6, out7, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst); + dst += dst_stride; + } +} + +void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, 
src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, + src65_r, src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, 7); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + +void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + v16i8 src109_r, filt0, filt1, filt2; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, + src10_r, src32_r, src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l, filt0, filt1, filt2; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, 
out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, + src32_r, src43_r, src21_r); + ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, + src32_l, src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, + src65_r, src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, + src65_l, src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, + filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, + filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, + filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, + filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, + filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, + filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, + filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, + filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 mask0, mask1, mask2, out; + v8i16 tmp0, tmp1; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (2 + 2 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + + for 
(loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); + + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + + XORI_B2_128_SB(src7, src8); + hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); + + out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 mask0, mask1, mask2, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 tmp0, tmp1, tmp2, tmp3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (2 + 2 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); + tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = (v8i16) 
__msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); + tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + + +void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height, + mx, my); + + src += 8; + dst += 8; + } +} + +static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1; + v16u8 out; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + SRARI_H2_SH(out0, out1, 7); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + 
filt0, filt1, out0, out1); + HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, + filt0, filt1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + filt0, filt1, out0, out1); + HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, + filt0, filt1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = subpel_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } else if (16 == height) { + common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter); + } +} + +void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 out; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= 1; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, 
mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SRARI_H4_SH(out4, out5, out6, out7, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst); + dst += dst_stride; + } +} + +void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, filt0, filt1; + v8i16 filt, out10, out32; + v16u8 out; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB3(src, src_stride, src3, src4, src5); + src += (3 * src_stride); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); + src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); + out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); + + src2 = LD_SB(src); + src += (src_stride); + ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); + src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); + src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); + out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); + SRARI_H2_SH(out10, out32, 7); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src7, src8, src9, src10; + v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, 
dst_stride); + dst += (4 * dst_stride); + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); + out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); + out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + v16u8 mask0, mask1, out; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (1 + 1 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, 
src5, src6); + src += (4 * src_stride); + + XORI_B2_128_SB(src3, src4); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); + vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + XORI_B2_128_SB(src5, src6); + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); + vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + v16u8 mask0, mask1, out0, out1; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 vec0, vec1, vec2, vec3, vec4; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (1 + 1 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); + tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); + tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + vec0 = vec4; + vec2 = vec1; + } +} + +void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + 
ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height, + mx, my); + + src += 8; + dst += 8; + } +} + +void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 res0, res1, mask0, mask1, mask2; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (2 + 1 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); + vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); + vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + XORI_B2_128_UB(res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + v16u8 out0, out1; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= (2 + src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + ILVEV_B2_SH(hz_out0, hz_out1, 
hz_out1, hz_out2, vec0, vec2); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); + tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); + tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height, + mx, my); + + src += 8; + dst += 8; + } +} + +void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, mask0, mask1; + v16u8 out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2; + + mask0 = LD_SB(&mc_filt_mask_arr[16]); + + src -= (1 + 2 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + XORI_B4_128_SB(src5, src6, src7, src8); + src += (4 * src_stride); + + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); + out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + 
+ hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); + out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; + const int8_t *filter_vert = subpel_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, mask0, mask1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 vec0, vec1; + + mask0 = LD_SB(&mc_filt_mask_arr[0]); + src -= (1 + 2 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + + hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); + tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); + tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) 
+{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height, + mx, my); + + src += 8; + dst += 8; + } +} + +static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, 7); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec4, vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, 
src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height); + } +} + +void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += 
dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + 
const int8_t *filter) +{ + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height); + } +} + +void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, 
vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); + hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, + vec4, vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); + SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 
0, 1, dst, dst_stride); +} + +void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, 
filt_hz, 7); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, 7); + SAT_UH2_UH(tmp3, tmp4, 7); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7); + SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = bilinear_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = 
HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + uint8_t *src_tmp, *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3; + + if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} diff --git a/libavcodec/mips/vp8dsp_init_mips.c b/libavcodec/mips/vp8dsp_init_mips.c new file mode 100644 index 0000000000..58d1b6ce38 --- /dev/null +++ b/libavcodec/mips/vp8dsp_init_mips.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * VP8 compatible video decoder + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp_mips.h" + +#define VP8_MC_MIPS_FUNC(IDX, SIZE) \ + dsp->put_vp8_epel_pixels_tab[IDX][0][1] = \ + ff_put_vp8_epel##SIZE##_h4_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][0][2] = \ + ff_put_vp8_epel##SIZE##_h6_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][0] = \ + ff_put_vp8_epel##SIZE##_v4_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][1] = \ + ff_put_vp8_epel##SIZE##_h4v4_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][2] = \ + ff_put_vp8_epel##SIZE##_h6v4_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][2][0] = \ + ff_put_vp8_epel##SIZE##_v6_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][2][1] = \ + ff_put_vp8_epel##SIZE##_h4v6_msa; \ + dsp->put_vp8_epel_pixels_tab[IDX][2][2] = \ + ff_put_vp8_epel##SIZE##_h6v6_msa + +#define VP8_BILINEAR_MC_MIPS_FUNC(IDX, SIZE) \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] = \ + ff_put_vp8_bilinear##SIZE##_h_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] = \ + ff_put_vp8_bilinear##SIZE##_h_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] = \ + ff_put_vp8_bilinear##SIZE##_v_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] = \ + ff_put_vp8_bilinear##SIZE##_hv_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] = \ + ff_put_vp8_bilinear##SIZE##_hv_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] = \ + ff_put_vp8_bilinear##SIZE##_v_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = \ + ff_put_vp8_bilinear##SIZE##_hv_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = \ + ff_put_vp8_bilinear##SIZE##_hv_msa + +#define VP8_MC_MIPS_COPY(IDX, SIZE) \ + dsp->put_vp8_epel_pixels_tab[IDX][0][0] = \ + ff_put_vp8_pixels##SIZE##_msa; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = \ + ff_put_vp8_pixels##SIZE##_msa; + +#if HAVE_MSA +static av_cold void vp8dsp_init_msa(VP8DSPContext *dsp) +{ + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_msa; + dsp->vp8_idct_add = ff_vp8_idct_add_msa; + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_msa; + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_msa; + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_msa; + + VP8_MC_MIPS_FUNC(0, 16); + VP8_MC_MIPS_FUNC(1, 8); + VP8_MC_MIPS_FUNC(2, 4); + + VP8_BILINEAR_MC_MIPS_FUNC(0, 16); + VP8_BILINEAR_MC_MIPS_FUNC(1, 8); + VP8_BILINEAR_MC_MIPS_FUNC(2, 4); + + VP8_MC_MIPS_COPY(0, 16); + VP8_MC_MIPS_COPY(1, 8); + + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_msa; + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_msa; + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_msa; + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_msa; + + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_msa; + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_msa; + 
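/* inner-edge filters for the chroma (U/V) planes */ + 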
dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_msa; + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_msa; + + dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_msa; + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_msa; +} +#endif // #if HAVE_MSA + +av_cold void ff_vp8dsp_init_mips(VP8DSPContext *dsp) +{ +#if HAVE_MSA + vp8dsp_init_msa(dsp); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/vp8dsp_mips.h b/libavcodec/mips/vp8dsp_mips.h new file mode 100644 index 0000000000..8e715b58be --- /dev/null +++ b/libavcodec/mips/vp8dsp_mips.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_VP8DSP_MIPS_H +#define AVCODEC_MIPS_VP8DSP_MIPS_H + +void ff_put_vp8_pixels4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int x, int y); +void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int x, int y); +void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int x, int y); + +void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dststride, + 
uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); +void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int h, int mx, int my); + +/* loop filter */ +void ff_vp8_h_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride, + int32_t e, int32_t i, int32_t h); +void ff_vp8_v_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride, + int32_t e, int32_t i, int32_t h); +void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void ff_vp8_h_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void ff_vp8_v_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void ff_vp8_h_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void 
ff_vp8_v_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t stride, + int flim_e, int flim_i, int hev_thresh); +void ff_vp8_h_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim); +void ff_vp8_v_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim); + +/* Idct functions */ +void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_idct_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); + +#endif // #ifndef AVCODEC_MIPS_VP8DSP_MIPS_H diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c new file mode 100644 index 0000000000..25ea16c72a --- /dev/null +++ b/libavcodec/mips/vp9_idct_msa.c @@ -0,0 +1,2138 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp9dsp_mips.h" + +#define VP9_DCT_CONST_BITS 14 +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) + +static const int32_t cospi_1_64 = 16364; +static const int32_t cospi_2_64 = 16305; +static const int32_t cospi_3_64 = 16207; +static const int32_t cospi_4_64 = 16069; +static const int32_t cospi_5_64 = 15893; +static const int32_t cospi_6_64 = 15679; +static const int32_t cospi_7_64 = 15426; +static const int32_t cospi_8_64 = 15137; +static const int32_t cospi_9_64 = 14811; +static const int32_t cospi_10_64 = 14449; +static const int32_t cospi_11_64 = 14053; +static const int32_t cospi_12_64 = 13623; +static const int32_t cospi_13_64 = 13160; +static const int32_t cospi_14_64 = 12665; +static const int32_t cospi_15_64 = 12140; +static const int32_t cospi_16_64 = 11585; +static const int32_t cospi_17_64 = 11003; +static const int32_t cospi_18_64 = 10394; +static const int32_t cospi_19_64 = 9760; +static const int32_t cospi_20_64 = 9102; +static const int32_t cospi_21_64 = 8423; +static const int32_t cospi_22_64 = 7723; +static const int32_t cospi_23_64 = 7005; +static const int32_t cospi_24_64 = 6270; +static const int32_t cospi_25_64 = 5520; +static const int32_t cospi_26_64 = 4756; +static const int32_t cospi_27_64 = 3981; +static const int32_t cospi_28_64 = 3196; +static const int32_t cospi_29_64 = 2404; +static const int32_t cospi_30_64 = 1606; +static const int32_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const int32_t sinpi_1_9 = 5283; +static const int32_t sinpi_2_9 = 9929; +static const int32_t sinpi_3_9 = 13377; +static const int32_t sinpi_4_9 
= 15212; + +#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ +{ \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32) __msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16) s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m); \ +} + +#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ + dst0, dst1, dst2, dst3) \ +{ \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ + tp0_m, tp2_m, tp3_m, tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ + tp5_m, tp6_m, tp7_m, tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, VP9_DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, VP9_DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ + dst0, dst1, dst2, dst3); \ +} + +#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ +( { \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, VP9_DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16) tp1_m, (v8i16) tp0_m); \ + \ + dst_m; \ +} ) + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ + -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in7, in0, \ + in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in5, in2, \ + in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ + cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + 
ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst2_m, cnst3_m, cnst1_m, out1, out6, \ + s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ +} + +#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) \ +{ \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ + c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ +} + +#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ + out0, out1, out2, out3) \ +{ \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ +} + +#define VP9_SET_COSPI_PAIR(c0_h, c1_h) \ +( { \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ +} ) + +#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ +{ \ + uint8_t *dst_m = (uint8_t *) (dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ + zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \ + res0_m, res1_m, res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ +} + +#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, 
cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m, \ + (v8i16) tmp2_m, (v8i16) tmp3_m, \ + out0, out1, out2, out3); \ +} + +#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ + sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ + -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ +} + +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ + tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16) __msa_ilvr_d((v2i64) tmp1_m, (v2i64) tmp0_m); \ + out1 = (v8i16) __msa_ilvl_d((v2i64) tmp1_m, (v2i64) tmp0_m); \ + out2 = (v8i16) __msa_ilvr_d((v2i64) tmp3_m, (v2i64) tmp2_m); \ + out3 = (v8i16) __msa_ilvl_d((v2i64) tmp3_m, (v2i64) tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ +} + +static void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} + +static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, 
in0, in1, in2, in3); + /* rows */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +static void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* cols */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +static void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* cols */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ +( { \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ +} ) + +/* multiply and add macro */ +#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ + out0, out1, out2, out3) \ +{ \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ + cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ + cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ +} + +/* idct 8x8 macro */ +#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = 
VP9_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ + VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ + in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ + out0, out1, out2, out3, out4, out5, out6, out7); \ +} + +#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ + cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ + cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ + -cospi_16_64, 0, 0, 0, 0 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + 
BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ +} + +static void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} + +static void vp9_idct8x8_12_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + ILVR_D2_SH(in5, in4, in7, in6, in2, in3); + //TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, 
s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, VP9_DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +static void vp9_idct8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 cnst0, cnst1, cnst2, cnst3, cnst4; + v8i16 temp0, temp1, temp2, temp3, s0, s1; + v16i8 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* 1D adst8x8 */ + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + cnst0 = __msa_fill_h(cospi_2_64); + cnst1 = __msa_fill_h(cospi_30_64); + cnst2 = -cnst0; + ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1); + cnst2 = __msa_fill_h(cospi_18_64); + cnst3 = __msa_fill_h(cospi_14_64); + cnst4 = -cnst2; + ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3); + + ILVRL_H2_SH(in0, in7, temp1, temp0); + ILVRL_H2_SH(in4, in3, temp3, temp2); + VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2, + cnst3, in7, in0, in4, in3); + + cnst0 = __msa_fill_h(cospi_10_64); + cnst1 = __msa_fill_h(cospi_22_64); + cnst2 = -cnst0; + ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1); + cnst2 = __msa_fill_h(cospi_26_64); + cnst3 = __msa_fill_h(cospi_6_64); + cnst4 = -cnst2; + ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3); + + ILVRL_H2_SH(in2, in5, temp1, temp0); + ILVRL_H2_SH(in6, in1, temp3, temp2); + VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, 
temp3, cnst0, cnst1, cnst2, + cnst3, in5, in2, in6, in1); + BUTTERFLY_4(in7, in0, in2, in5, s1, s0, in2, in5); + out7 = -s0; + out0 = s1; + SRARI_H2_SH(out0, out7, 5); + dst0 = LD_UB(dst + 0 * dst_stride); + dst7 = LD_UB(dst + 7 * dst_stride); + + res0 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst0); + res0 += out0; + res0 = CLIP_SH_0_255(res0); + res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0); + ST8x1_UB(res0, dst); + + res7 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst7); + res7 += out7; + res7 = CLIP_SH_0_255(res7); + res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7); + ST8x1_UB(res7, dst + 7 * dst_stride); + + cnst1 = __msa_fill_h(cospi_24_64); + cnst0 = __msa_fill_h(cospi_8_64); + cnst3 = -cnst1; + cnst2 = -cnst0; + + ILVEV_H2_SH(cnst3, cnst0, cnst1, cnst2, cnst3, cnst2); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + cnst1 = cnst0; + + ILVRL_H2_SH(in4, in3, temp1, temp0); + ILVRL_H2_SH(in6, in1, temp3, temp2); + VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst2, cnst3, + cnst1, out1, out6, s0, s1); + out1 = -out1; + SRARI_H2_SH(out1, out6, 5); + dst1 = LD_UB(dst + 1 * dst_stride); + dst6 = LD_UB(dst + 6 * dst_stride); + ILVR_B2_SH(zero, dst1, zero, dst6, res1, res6); + ADD2(res1, out1, res6, out6, res1, res6); + CLIP_SH2_0_255(res1, res6); + PCKEV_B2_SH(res1, res1, res6, res6, res1, res6); + ST8x1_UB(res1, dst + dst_stride); + ST8x1_UB(res6, dst + 6 * dst_stride); + + cnst0 = __msa_fill_h(cospi_16_64); + cnst1 = -cnst0; + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + ILVRL_H2_SH(in2, in5, temp1, temp0); + ILVRL_H2_SH(s0, s1, temp3, temp2); + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst0); + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst1); + out3 = -out3; + SRARI_H2_SH(out3, out4, 5); + dst3 = LD_UB(dst + 3 * dst_stride); + dst4 = LD_UB(dst + 4 * dst_stride); + ILVR_B2_SH(zero, dst3, zero, dst4, res3, res4); + ADD2(res3, out3, res4, out4, res3, res4); + CLIP_SH2_0_255(res3, res4); + PCKEV_B2_SH(res3, res3, res4, res4, res3, res4); + ST8x1_UB(res3, dst + 3 * dst_stride); + ST8x1_UB(res4, dst + 4 * dst_stride); + + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0); + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1); + out5 = -out5; + SRARI_H2_SH(out2, out5, 5); + dst2 = LD_UB(dst + 2 * dst_stride); + dst5 = LD_UB(dst + 5 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst5, res2, res5); + ADD2(res2, out2, res5, out5, res2, res5); + CLIP_SH2_0_255(res2, res5); + PCKEV_B2_SH(res2, res2, res5, res5, res2, res5); + ST8x1_UB(res2, dst + 2 * dst_stride); + ST8x1_UB(res5, dst + 5 * dst_stride); +} + +static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in1, in6, in3, in4, in5, in2, in7, in0); + /* 1D idct8x8 */ + VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +static void 
vp9_idct_iadst_8x8_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in1, in6, in3, in4, in5, in2, in7, in0); + /* 1D idct8x8 */ + VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ + r9, r10, r11, r12, r13, r14, r15, \ + out0, out1, out2, out3, out4, out5, \ + out6, out7, out8, out9, out10, out11, \ + out12, out13, out14, out15) \ +{ \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ + g0_m, g1_m, g2_m, g3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ + g4_m, g5_m, g6_m, g7_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ + g8_m, g9_m, g10_m, g11_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ + g12_m, g13_m, g14_m, g15_m); \ + \ + /* stage 2 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ + h0_m, h1_m, h2_m, h3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ + h4_m, h5_m, h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ + h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = 
VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ + out4, out6, out5, out7); \ + VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ + out12, out14, out13, out15); \ + \ + /* stage 4 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ +} + +static void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, + reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + 
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); +} + +static void vp9_idct16_1d_columns_msa(int16_t *input, int16_t *output) +{ + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, + reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + + tmp5 = loc1; + + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, + reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH4(reg0, reg2, reg4, reg6, output, 16); + ST_SH4(reg8, reg10, reg12, reg14, (output + 4 * 16), 16); + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, + reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH4(reg3, reg13, reg11, 
reg5, (output + 8), 16); + ST_SH4(reg7, reg9, reg1, reg15, (output + 8 + 4 * 16), 16); +} + +static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + uint8_t i; + int16_t out; + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) + { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, + res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, + res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, + tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void vp9_idct16x16_10_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int32_t i; + int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); + int16_t *out = out_arr; + + /* transform rows */ + vp9_idct16_1d_columns_msa(input, out); + + /* short case just considers top 4 rows as valid output */ + out += 4 * 16; + for (i = 12; i--;) { + __asm__ volatile ( + "sw $zero, 0(%[out]) \n\t" + "sw $zero, 4(%[out]) \n\t" + "sw $zero, 8(%[out]) \n\t" + "sw $zero, 12(%[out]) \n\t" + "sw $zero, 16(%[out]) \n\t" + "sw $zero, 20(%[out]) \n\t" + "sw $zero, 24(%[out]) \n\t" + "sw $zero, 28(%[out]) \n\t" + + : + : [out] "r" (out) + ); + + out += 16; + } + + out = out_arr; + + /* transform columns */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +static void vp9_idct16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int32_t i; + int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); + int16_t *out = out_arr; + + /* transform rows */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_idct16_1d_columns_msa((input + (i << 3)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +static void vp9_iadst16_1d_columns_msa(int16_t *input, int16_t *output) +{ + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 16, + l0, l1, l2, l3, l4, l5, l6, l7, + l8, l9, l10, l11, l12, l13, l14, l15); + + /* ADST in horizontal */ + VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, + l8, l9, l10, l11, l12, l13, l14, l15, + r0, r1, r2, r3, r4, r5, r6, r7, + r8, r9, r10, r11, r12, r13, r14, r15); + + l1 = -r8; + l3 = -r4; + l13 = -r13; + l15 = -r1; + + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, + l0, l1, l2, l3, l4, l5, l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, + l8, l9, l10, l11, l12, l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); +} + +static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 
+ int32_t dst_stride) +{ + v8i16 v0, v2, v4, v6, k0, k1, k2, k3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 res8, res9, res10, res11, res12, res13, res14, res15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + v16i8 zero = { 0 }; + + r0 = LD_SH(input + 0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); + + /* stage 1 */ + k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); + k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); + k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); + BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); + BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); + out1 = -out1; + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + 
PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + ST8x1_UB(res5, dst + 12 * dst_stride); + + VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); + + VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); + + VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); +} + +static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT); + int16_t *out = out_arr; + int32_t i; + + /* transform rows */ + for (i = 0; i < 2; i++) { + /* process 16 * 8 block */ + vp9_iadst16_1d_columns_msa((input + (i << 3)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; i++) { + 
/* process 8 * 16 block */ + vp9_iadst16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +static void vp9_iadst_idct_16x16_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + int32_t i; + int16_t out[16 * 16]; + int16_t *out_ptr = &out[0]; + + /* transform rows */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_iadst16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } +} + +static void vp9_idct_iadst_16x16_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t eob) +{ + int32_t i; + int16_t out[16 * 16]; + int16_t *out_ptr = &out[0]; + + /* transform rows */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_idct16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; i++) { + /* process 8 * 16 block */ + vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } +} + +static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, + int16_t *dst) +{ + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, 
n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, m5, n5, m6, n6, m7, n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); + + /* 3rd & 4th 8x8 */ + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, m5, n5, m6, n6, m7, n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); +} + +static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) +{ + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); + + VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); 
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); +} + +static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) +{ + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); + + /* 4 Stores */ + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, + vec0, vec1, vec2, vec3); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, + vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd 
stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, + uint8_t *dst, + int32_t dst_stride) +{ + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), + m0, m2, m4, m6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), + m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), + m1, m3, m5, m7); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), + n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), + 
n0, n2, n4, n6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), + n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), + n1, n3, n5, n7); +} + +static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); + int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); + + vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], + dst, dst_stride); +} + +static void vp9_idct8x32_1d_columns_msa(int16_t *input, int16_t *output, + int16_t *tmp_buf) +{ + int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); + int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT); + + vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0], + &tmp_odd_buf[0], output); +} + +static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int32_t i; + int16_t out; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) + { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, + res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, + res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, + tmp0, tmp1, tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; + } +} + +static void vp9_idct32x32_34_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int32_t i; + int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT); + int16_t *out_ptr = out_arr; + int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT); + + for (i = 32; i--;) { + __asm__ volatile ( + "sw $zero, (%[out_ptr]) \n\t" + "sw $zero, 4(%[out_ptr]) \n\t" + "sw $zero, 8(%[out_ptr]) \n\t" + "sw $zero, 12(%[out_ptr]) \n\t" + "sw $zero, 16(%[out_ptr]) \n\t" + "sw $zero, 20(%[out_ptr]) \n\t" + "sw $zero, 24(%[out_ptr]) \n\t" + "sw $zero, 28(%[out_ptr]) \n\t" + "sw $zero, 32(%[out_ptr]) \n\t" + "sw $zero, 36(%[out_ptr]) \n\t" + "sw $zero, 40(%[out_ptr]) \n\t" + "sw $zero, 44(%[out_ptr]) \n\t" + "sw $zero, 48(%[out_ptr]) \n\t" + "sw $zero, 52(%[out_ptr]) \n\t" + "sw $zero, 56(%[out_ptr]) \n\t" + "sw 
$zero, 60(%[out_ptr]) \n\t" + + : + : [out_ptr] "r" (out_ptr) + ); + + out_ptr += 32; + } + + out_ptr = out_arr; + + /* process 8*32 block */ + vp9_idct8x32_1d_columns_msa(input, out_ptr, &tmp_buf[0]); + + /* transform columns */ + for (i = 0; i < 4; i++) { + /* process 8*32 block */ + vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } +} + +static void vp9_idct32x32_colcol_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) +{ + int32_t i; + int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT); + int16_t *out_ptr = out_arr; + int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT); + + /* transform rows */ + for (i = 0; i < 4; i++) { + /* process 8*32 block */ + vp9_idct8x32_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 8)), + &tmp_buf[0]); + } + + /* transform columns */ + for (i = 0; i < 4; i++) { + /* process 8*32 block */ + vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } +} + +void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + if (eob > 1) { + vp9_idct4x4_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 4 * 4 * sizeof(*block)); + } + else { + vp9_idct4x4_1_add_msa(block, dst, stride); + block[0] = 0; + } +} + +void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + if (eob == 1) { + vp9_idct8x8_1_add_msa(block, dst, stride); + block[0] = 0; + } + else if (eob <= 12) { + vp9_idct8x8_12_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 4 * 8 * sizeof(*block)); + } + else { + vp9_idct8x8_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 8 * 8 * sizeof(*block)); + } +} + +void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + int i; + + if (eob == 1) { + /* DC only DCT coefficient. 
*/ + vp9_idct16x16_1_add_msa(block, dst, stride); + block[0] = 0; + } + else if (eob <= 10) { + vp9_idct16x16_10_colcol_addblk_msa(block, dst, stride); + for (i = 0; i < 4; ++i) { + memset(block, 0, 4 * sizeof(*block)); + block += 16; + } + } + else { + vp9_idct16x16_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 16 * 16 * sizeof(*block)); + } +} + +void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + int i; + + if (eob == 1) { + vp9_idct32x32_1_add_msa(block, dst, stride); + block[0] = 0; + } + else if (eob <= 34) { + vp9_idct32x32_34_colcol_addblk_msa(block, dst, stride); + for (i = 0; i < 8; ++i) { + memset(block, 0, 8 * sizeof(*block)); + block += 32; + } + } + else { + vp9_idct32x32_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 32 * 32 * sizeof(*block)); + } +} + +void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst4x4_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 4 * 4 * sizeof(*block)); +} + +void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst8x8_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 8 * 8 * sizeof(*block)); +} + +void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst16x16_colcol_addblk_msa(block, dst, stride); + memset(block, 0, 16 * 16 * sizeof(*block)); +} + +void ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_idct_iadst_4x4_add_msa(block, dst, stride, eob); + memset(block, 0, 4 * 4 * sizeof(*block)); +} + +void ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_idct_iadst_8x8_add_msa(block, dst, stride, eob); + memset(block, 0, 8 * 8 * sizeof(*block)); +} + +void ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_idct_iadst_16x16_add_msa(block, dst, stride, eob); + memset(block, 0, 16 * 16 * sizeof(*block)); +} + +void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst_idct_4x4_add_msa(block, dst, stride, eob); + memset(block, 0, 4 * 4 * sizeof(*block)); +} + +void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst_idct_8x8_add_msa(block, dst, stride, eob); + memset(block, 0, 8 * 8 * sizeof(*block)); +} + +void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + vp9_iadst_idct_16x16_add_msa(block, dst, stride, eob); + memset(block, 0, 16 * 16 * sizeof(*block)); +} diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c new file mode 100644 index 0000000000..54cf0ae94f --- /dev/null +++ b/libavcodec/mips/vp9_intra_msa.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp9dsp_mips.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ +{ \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ +} + +void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, + const uint8_t *src) +{ + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, + const uint8_t *src) +{ + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, + const uint8_t *top) +{ + uint32_t row, inp; + v16u8 src0, src1, src2, src3; + + src += 12; + for (row = 4; row--;) { + inp = LW(src); + src -= 4; + + src0 = (v16u8) __msa_fill_b(inp >> 24); + src1 = (v16u8) __msa_fill_b(inp >> 16); + src2 = (v16u8) __msa_fill_b(inp >> 8); + src3 = (v16u8) __msa_fill_b(inp); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, + const uint8_t *top) +{ + uint32_t row, inp; + v16u8 src0, src1, src2, src3; + + src += 28; + for (row = 8; row--;) { + inp = LW(src); + src -= 4; + + src0 = (v16u8) __msa_fill_b(inp >> 24); + src1 = (v16u8) __msa_fill_b(inp >> 16); + src2 = (v16u8) __msa_fill_b(inp >> 8); + src3 = (v16u8) __msa_fill_b(inp); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, + const uint8_t *src_top) +{ + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); + store = __msa_splati_b((v16i8) sum_w, 0); + val0 = __msa_copy_u_w((v4i32) store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +#define INTRA_DC_TL_4x4(dir) \ +void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, \ + const uint8_t *top) \ +{ \ + uint32_t val0; \ + v16i8 store, data = { 0 }; \ + v8u16 sum_h; \ + v4u32 sum_w; \ + \ + val0 = LW(dir); \ + data = (v16i8) __msa_insert_w((v4i32) data, 0, val0); \ + sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data); \ + sum_w = __msa_hadd_u_w(sum_h, sum_h); \ + sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2); \ + store = __msa_splati_b((v16i8) sum_w, 0); \ + val0 = __msa_copy_u_w((v4i32) store, 0); \ + \ + SW4(val0, val0, val0, val0, dst, dst_stride); \ +} +INTRA_DC_TL_4x4(top); +INTRA_DC_TL_4x4(left); + +void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, + const uint8_t *src_top) +{ + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; 
+ v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); + store = __msa_splati_b((v16i8) sum_w, 0); + val0 = __msa_copy_u_d((v2i64) store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +#define INTRA_DC_TL_8x8(dir) \ +void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, \ + const uint8_t *top) \ +{ \ + uint64_t val0; \ + v16i8 store; \ + v16u8 data = { 0 }; \ + v8u16 sum_h; \ + v4u32 sum_w; \ + v2u64 sum_d; \ + \ + val0 = LD(dir); \ + data = (v16u8) __msa_insert_d((v2i64) data, 0, val0); \ + sum_h = __msa_hadd_u_h(data, data); \ + sum_w = __msa_hadd_u_w(sum_h, sum_h); \ + sum_d = __msa_hadd_u_d(sum_w, sum_w); \ + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); \ + store = __msa_splati_b((v16i8) sum_w, 0); \ + val0 = __msa_copy_u_d((v2i64) store, 0); \ + \ + SD4(val0, val0, val0, val0, dst, dst_stride); \ + dst += (4 * dst_stride); \ + SD4(val0, val0, val0, val0, dst, dst_stride); \ +} + +INTRA_DC_TL_8x8(top); +INTRA_DC_TL_8x8(left); + +void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top) +{ + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); + out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +#define INTRA_DC_TL_16x16(dir) \ +void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, \ + const uint8_t *top) \ +{ \ + v16u8 data, out; \ + v8u16 sum_h; \ + v4u32 sum_w; \ + v2u64 sum_d; \ + \ + data = LD_UB(dir); \ + sum_h = __msa_hadd_u_h(data, data); \ + sum_w = __msa_hadd_u_w(sum_h, sum_h); \ + sum_d = __msa_hadd_u_d(sum_w, sum_w); \ + sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ + sum_d = __msa_hadd_u_d(sum_w, sum_w); \ + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); \ + out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ + \ + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ + dst += (8 * dst_stride); \ + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ +} +INTRA_DC_TL_16x16(top); +INTRA_DC_TL_16x16(left); + +void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top) +{ + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + 
sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6); + out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); + + for (row = 16; row--;) + { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +#define INTRA_DC_TL_32x32(dir) \ +void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, \ + const uint8_t *top) \ +{ \ + uint32_t row; \ + v16u8 data0, data1, out; \ + v8u16 sum_h, sum_data0, sum_data1; \ + v4u32 sum_w; \ + v2u64 sum_d; \ + \ + LD_UB2(dir, 16, data0, data1); \ + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); \ + sum_h = sum_data0 + sum_data1; \ + sum_w = __msa_hadd_u_w(sum_h, sum_h); \ + sum_d = __msa_hadd_u_d(sum_w, sum_w); \ + sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \ + sum_d = __msa_hadd_u_d(sum_w, sum_w); \ + sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); \ + out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \ + \ + for (row = 16; row--;) \ + { \ + ST_UB2(out, out, dst, 16); \ + dst += dst_stride; \ + ST_UB2(out, out, dst, 16); \ + dst += dst_stride; \ + } \ +} +INTRA_DC_TL_32x32(top); +INTRA_DC_TL_32x32(left); + +#define INTRA_PREDICT_VALDC_16X16_MSA(val) \ +void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, const uint8_t *top) \ +{ \ + v16u8 out = (v16u8) __msa_ldi_b(val); \ + \ + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ + dst += (8 * dst_stride); \ + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \ +} + +INTRA_PREDICT_VALDC_16X16_MSA(127); +INTRA_PREDICT_VALDC_16X16_MSA(128); +INTRA_PREDICT_VALDC_16X16_MSA(129); + +#define INTRA_PREDICT_VALDC_32X32_MSA(val) \ +void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *left, const uint8_t *top) \ +{ \ + uint32_t row; \ + v16u8 out = (v16u8) __msa_ldi_b(val); \ + \ + for (row = 16; row--;) \ + { \ + ST_UB2(out, out, dst, 16); \ + dst += dst_stride; \ + ST_UB2(out, out, dst, 16); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_32X32_MSA(127); +INTRA_PREDICT_VALDC_32X32_MSA(128); +INTRA_PREDICT_VALDC_32X32_MSA(129); + +void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top_ptr) +{ + uint32_t left; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16) __msa_fill_h(top_left); + src_top = LD_SB(src_top_ptr); + left = LW(src_left); + src_left0 = __msa_fill_b(left >> 24); + src_left1 = __msa_fill_b(left >> 16); + src_left2 = __msa_fill_b(left >> 8); + src_left3 = __msa_fill_b(left); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top_ptr) +{ + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt, left; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; + v8u16 src_top_left, vec0, 
vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16) __msa_fill_h(top_left); + + src_left += 4; + for (loop_cnt = 2; loop_cnt--;) { + left = LW(src_left); + src_left0 = __msa_fill_b(left >> 24); + src_left1 = __msa_fill_b(left >> 16); + src_left2 = __msa_fill_b(left >> 8); + src_left3 = __msa_fill_b(left); + src_left -= 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top_ptr) +{ + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt, left; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16) __msa_fill_h(top_left); + + src_left += 12; + for (loop_cnt = 4; loop_cnt--;) { + left = LW(src_left); + src_left0 = __msa_fill_b(left >> 24); + src_left1 = __msa_fill_b(left >> 16); + src_left2 = __msa_fill_b(left >> 8); + src_left3 = __msa_fill_b(left); + src_left -= 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src_left, const uint8_t *src_top_ptr) +{ + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt, left; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + src_top0 = LD_SB(src_top_ptr); + src_top1 = LD_SB(src_top_ptr + 16); + src_top_left = (v8u16) __msa_fill_h(top_left); + + src_left += 28; + for (loop_cnt = 8; loop_cnt--;) { + left = LW(src_left); + src_left0 = __msa_fill_b(left >> 24); + src_left1 = __msa_fill_b(left >> 16); + src_left2 = __msa_fill_b(left >> 8); + src_left3 = __msa_fill_b(left); + src_left -= 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + 
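+        /* Descriptive note (not in the original patch): TM ("true motion")
+         * prediction; each pixel of the four rows handled per iteration is
+         * left + top - top_left, clamped at zero by the saturating subtract
+         * and capped at 255 by the saturation above, then the halfwords are
+         * packed back to bytes for the two 16-byte stores per row. */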
PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} diff --git a/libavcodec/mips/vp9_lpf_msa.c b/libavcodec/mips/vp9_lpf_msa.c new file mode 100644 index 0000000000..eef8afc482 --- /dev/null +++ b/libavcodec/mips/vp9_lpf_msa.c @@ -0,0 +1,2599 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp9dsp_mips.h" + +#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ +{ \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8) hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \ + filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r); \ + \ + filt = filt & (v16i8) mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \ + filt = filt & (v16i8) hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \ +} + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ +{ \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8) hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \ + filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \ + filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \ + filt = filt & (v16i8) mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \ + p0_m = 
__msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \ + filt = filt & (v16i8) hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \ +} + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ +{ \ + v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp < (v16u8) flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ +} + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ + q5_in, q6_in, q7_in, flat_in, flat2_out) \ +{ \ + v16u8 tmp, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp < (v16u8) flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ +} + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \ + q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, \ + q0_filt8_out, q1_filt8_out, q2_filt8_out) \ +{ \ + v8u16 tmp0, tmp1, tmp2; \ + \ + tmp2 = p2_in + p1_in + p0_in; \ + tmp0 = p3_in << 1; \ + \ + tmp0 = tmp0 + tmp2 + q0_in; \ + tmp1 = tmp0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ + \ + tmp1 = tmp0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ + \ + tmp1 = q2_in + q1_in + q0_in; \ + tmp2 = tmp2 + tmp1; \ + tmp0 = tmp2 + (p0_in); \ + tmp0 = tmp0 + (p3_in); \ + p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \ + \ + tmp0 = q2_in + q3_in; \ + tmp0 = p0_in + tmp1 + tmp0; \ + tmp1 = q3_in + q3_in; \ + tmp1 = tmp1 + tmp0; \ + q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ + \ + tmp0 = tmp2 + q3_in; \ + tmp1 = tmp0 + q0_in; \ + q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ + \ + tmp1 = tmp0 - p2_in; \ + tmp0 = q1_in + q3_in; \ + tmp1 = tmp0 + tmp1; \ + q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \ +} + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ + q0_in, q1_in, q2_in, 
q3_in, \ + limit_in, b_limit_in, thresh_in, \ + hev_out, mask_out, flat_out) \ +{ \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8) flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8) mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ +} + +void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + p1_d = __msa_copy_u_d((v2i64) p1_out, 0); + p0_d = __msa_copy_u_d((v2i64) p0_out, 0); + q0_d = __msa_copy_u_d((v2i64) q0_out, 0); + q1_d = __msa_copy_u_d((v2i64) q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + + +void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8) __msa_fill_b(thresh_ptr); + thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8); + thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0); + + b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr); + b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8); + b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0); + + limit0 = (v16u8) __msa_fill_b(limit_ptr); + limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8); + limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, + hev, mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + 
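+    /* Descriptive note (not in the original patch): 8-pixel wide filter
+     * across a horizontal edge; the mask/hev flags are built first, the
+     * 4-tap filter is always applied, and the wider "filter8" outputs
+     * (p2..q2) are blended in only for pixels whose flat test passes. If no
+     * pixel is flat, the filter8 work is skipped entirely. */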
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64) p1_out, 0); + p0_d = __msa_copy_u_d((v2i64) p0_out, 0); + q0_d = __msa_copy_u_d((v2i64) q0_out, 0); + q1_d = __msa_copy_u_d((v2i64) q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, + zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64) p2_out, 0); + p1_d = __msa_copy_u_d((v2i64) p1_out, 0); + p0_d = __msa_copy_u_d((v2i64) p0_out, 0); + q0_d = __msa_copy_u_d((v2i64) q0_out, 0); + q1_d = __msa_copy_u_d((v2i64) q1_out, 0); + q2_d = __msa_copy_u_d((v2i64) q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, 
(v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, + q1_filt8_r, q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, 
p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, + p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r, + p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, + q1_filt8_r, q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + tmp = (v16u8) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, + p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat); + q2_out = 
__msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch, + uint8_t *filter48, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48) +{ + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; + v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; + v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, 
tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, + p3_r_in, p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4); + + q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= 
p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); + 
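+        /* Descriptive note (not in the original patch): q4..q6 below are
+         * produced the same way; the running wide-filter sums are updated
+         * incrementally (entering samples added, leaving ones subtracted),
+         * each row is rounded with (sum + 8) >> 4 and blended into the
+         * original pixels under the flat2 mask. */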
ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); + ST_UB(q6, src); + } +} + +void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT); + uint8_t early_exit = 0; + + early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], + b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + vp9_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64) p1_out, 0); + p0_d = __msa_copy_u_d((v2i64) p0_out, 0); + q0_d = __msa_copy_u_d((v2i64) q0_out, 0); + q1_d = __msa_copy_u_d((v2i64) q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, + q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, + q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, + p2_filter8, p1_filter8, 
p0_filter8, q0_filter8, + q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, + zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64) p2_out, 0); + p1_d = __msa_copy_u_d((v2i64) p1_out, 0); + p0_d = __msa_copy_u_d((v2i64) p0_out, 0); + q0_d = __msa_copy_u_d((v2i64) q0_out, 0); + q1_d = __msa_copy_u_d((v2i64) q1_out, 0); + q2_d = __msa_copy_u_d((v2i64) q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, + zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, + q4_r, q5_r, q6_r, q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* 
calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } +} + +void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, + p3, p2, p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 mask, hev, flat; + 
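+    /* Descriptive note (not in the original patch): vertical-edge filter
+     * covering 16 rows; a 16x8 neighbourhood is loaded and transposed so
+     * the filter runs on columns, the two 8-row halves take their
+     * limit/thresh values from the low and high bytes of the *_ptr
+     * arguments, and the filtered p1..q1 are transposed back through
+     * interleaves before the narrow 4x8 stores. */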
v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8) __msa_fill_b(thresh_ptr); + thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8); + thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0); + + b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr); + b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8); + b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0); + + limit0 = (v16u8) __msa_fill_b(limit_ptr); + limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8); + limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, + hev, mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} + +void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, 
q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); + + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + +void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, + q3, q2, q1, q0, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + + /* filter8 */ + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, 
q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} + +void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, + q3, q2, q1, q0, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + 
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, + p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r, + p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, + q1_filt8_r, q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} + +void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, + q3, q2, q1, q0, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); + thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); + + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8); + b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); + + limit = (v16u8) __msa_fill_b(limit_ptr); + vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8); + limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, 
q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, + p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat); + q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} + +static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) +{ + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, + p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) +{ + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) +{ + 
v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p7, p6, p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0); + q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1); + q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2); + q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3); + q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4); + q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5); + q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6); + q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7); + tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5); + + ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3); + tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1); + + ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5); + q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, 
vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r); + p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r); + p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r); + q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r); + q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r); + q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, + uint8_t *filter48) +{ + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; + v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; + v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, + p3_r_in, p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 
4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) 
__msa_pckev_b((v16i8) r_out, (v16i8) r_out); + q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); + q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + uint8_t early_exit = 0; + uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); + uint8_t *filter48 = &transposed_input[16 * 16]; + + vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), + &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t thresh_ptr) +{ + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; + v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; + v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8) __msa_fill_b(thresh_ptr); + b_limit = (v16u8) __msa_fill_b(b_limit_ptr); + limit = (v16u8) __msa_fill_b(limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, + p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, 
q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, + uint8_t *filter48) +{ + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; + v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; + v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, + p3_r_in, p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0); + + tmp0_l 
= p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) (tmp1_l), 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = 
__msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16) tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16) tmp1_l, 4); + r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); + q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch, + int32_t b_limit_ptr, + int32_t limit_ptr, + int32_t 
thresh_ptr) +{ + uint8_t early_exit = 0; + uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); + uint8_t *filter48 = &transposed_input[16 * 16]; + + vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), + &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/libavcodec/mips/vp9_mc_msa.c b/libavcodec/mips/vp9_mc_msa.c new file mode 100644 index 0000000000..1671d973a4 --- /dev/null +++ b/libavcodec/mips/vp9_mc_msa.c @@ -0,0 +1,4510 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp9dsp_mips.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static const int8_t vp9_bilinear_filters_msa[15][2] = { + {120, 8}, + {112, 16}, + {104, 24}, + {96, 32}, + {88, 40}, + {80, 48}, + {72, 56}, + {64, 64}, + {56, 72}, + {48, 80}, + {40, 88}, + {32, 96}, + {24, 104}, + {16, 112}, + {8, 120} +}; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ + filt0, filt1, filt2, filt3) \ +( { \ + v8i16 tmp0, tmp1; \ + \ + tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ + tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \ + tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \ + tmp0 = __msa_adds_s_h(tmp0, tmp1); \ + \ + tmp0; \ +} ) + +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \ + filt_h0, filt_h1, filt_h2, filt_h3) \ +( { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \ + vec0_m, vec1_m, vec2_m, vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \ + filt_h0, filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, 7); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ +} ) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, 
filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ +} + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ +{ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ +} + +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ +{ \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \ + ST_UB(tmp_m, (pdst)); \ +} + +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ +{ \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \ + ST_UB(tmp_m, (pdst)); \ +} + +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ + pdst, stride) \ +{ \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +} + +static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + SRARI_H2_SH(out0, out1, 7); + SAT_SH2_SH(out0, out1, 7); + out = 
PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out2, out3); + + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, 
int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } else if (16 == height) { + common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, 
mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + mask2, mask3, filt0, filt1, filt2, filt3, + out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, + mask2, mask3, filt0, filt1, filt2, filt3, + out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 
32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, 7); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = 
PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, + int32_t width) +{ + const uint8_t 
*src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, + src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, + src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, + filt0, filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, + filt0, filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 64); +} + 
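/* For orientation, a minimal scalar sketch of what each 8-tap MSA path above
 * computes per output pixel, assuming the usual VP9 sub-pel convention
 * (eight int8_t taps summing to 128, rounding shift by 7, clamp to 8 bits).
 * The vector code reaches the same result through FILT_8TAP_DPADD_S_H +
 * SRARI_H(..., 7) + SAT_SH(..., 7) + PCKEV/XORI 128.  This helper and its
 * name are illustrative only; it is not part of this file.  "stride" is 1
 * for the horizontal pass and the row pitch for the vertical pass. */
static uint8_t filt_8tap_scalar_sketch(const uint8_t *src, int stride,
                                       const int8_t *filter)
{
    int k, sum = 0;

    for (k = 0; k < 8; k++)
        sum += filter[k] * src[k * stride];      /* 8-tap dot product  */

    sum = (sum + 64) >> 7;                       /* round and shift    */

    return sum < 0 ? 0 : sum > 255 ? 255 : sum;  /* clamp to [0, 255]  */
}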
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); + out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8); + out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + out0 = out2; + out1 = out3; + out2 = out4; + } +} + +static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = 
LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, + filt_vt1, filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void 
common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, res0, res1); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H2_SH(res0, res1, 7); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + XORI_B2_128_UB(res2, res3); + AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, 7); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, + res0, res1, res2, res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, + dst0, dst2, dst4, dst6); + ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); + AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void 
common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + if (4 == height) { + common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, + out1, out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, + vec8, vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, + out1, out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t 
*src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, + vec8, vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, + vec10, vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, + vec11, vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, + vec8, vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, + vec0, vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, + vec8, vec9, 
vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, + out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, 7); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + dst0 = (v16u8) __msa_ilvr_d((v2i64) dst2, (v2i64) dst0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + 
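/* Note on the *_and_aver_dst_* variants: the four destination rows just
 * loaded are blended with the filtered result before storing.  AVER_UB /
 * CONVERT_UB_AVG_ST8x4_UB compute a rounded average, i.e. roughly
 * out = (filtered + dst + 1) >> 1 per pixel, matching VP9's "avg" MC. */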
XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height, + int32_t width) +{ + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, + src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, + src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, + filt0, filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, + filt0, filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, + filt0, filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, + filt0, filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, + filt0, filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, + filt0, filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, + filt0, filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, + filt0, filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); + 
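/* As elsewhere in this file: SRARI_H(..., 7) is a rounding arithmetic shift
 * right by 7 (add 64, then shift), and SAT_SH(..., 7) clamps the signed
 * halfwords to [-128, 127] so that the PCKEV_B + XORI 128 steps below
 * recover unsigned 8-bit pixels from the signed domain the sources were
 * moved into when they were XORI'd with 128 on load. */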
SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, 
src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); + vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8); + vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8); + res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + SRARI_H2_SH(res0, res1, 7); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); + XORI_B2_128_UB(tmp0, tmp1); + AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, 
dst_stride, dst0, dst1, dst2, dst3); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, + height); + + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, + height); + + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, + height); + + src += 8; + dst += 8; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, 7); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, 
int32_t dst_stride, + const int8_t *filter) +{ + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec4, vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, 
src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height); + } +} + +void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +void 
ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, 
src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) +{ + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + 
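/* Bilinear (2-tap) vertical pass: ILVR_B interleaves each pair of adjacent
 * rows so a single DOTP_UB applies both taps at once.  With the usual VP9
 * bilinear tables (two taps summing to 128) each output pixel is roughly
 * (f0 * row[y] + f1 * row[y + 1] + 64) >> 7, then saturated and packed. */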
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, + height); + } +} + +void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src += (4 * 
src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, 7); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, 7); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + 
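/* The 64-wide vertical path handles each row as four independent 16-pixel
 * columns (dst, dst + 16, dst + 32, dst + 48); the last row loaded for each
 * column (src2/src5/src8/src11) is carried over as that column's first row
 * for the next iteration. */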
PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, 7); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_ST_SB(tmp4, tmp5, dst + 48); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, 7); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); + hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, + vec4, vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); + 
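+    /* each HORIZ_2TAP_FILT_UH above filtered two 4-pixel rows at once; the
+     * sldi/pckod shuffles rebuilt the odd-row vectors so ILVEV_B2 could pair
+     * row n with row n + 1 for the vertical tap */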
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, 7); + 
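+        /* hz_out0/hz_out1 ping-pong between the previous and the current
+         * horizontally filtered row; ilvev_b pairs their even (8-bit) bytes
+         * for the vertical dot product */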
SAT_UH2_UH(tmp1, tmp2, 7); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, 7); + SAT_UH2_UH(tmp3, tmp4, 7); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7); + SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 
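+        /* hz_out0/hz_out1 hold the left 8 columns and hz_out2/hz_out3 the
+         * right 8 columns of the 16-wide block; the even/odd pairs swap roles
+         * as previous/current row on every line */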
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, 7); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); + + src += 16; + dst += 16; + } +} + +void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); + + src += 16; + dst += 16; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, 7); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, + res2, res3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, 
dst4, dst7, dst6, dst0, dst2, + dst4, dst6); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, + res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + vec0, vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, 
dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + } +} + +void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, 7); + SRARI_H4_UH(res4, res5, res6, res7, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, + res1, res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, + res5, res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, 7); + SRARI_H4_UH(res4, res5, res6, res7, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, 
int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + res0, res1, res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + res4, res5, res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, 7); + SRARI_H4_UH(res4, res5, res6, res7, 7); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + out0, out1, out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + out4, out5, out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, 7); + SRARI_H4_UH(out4, out5, out6, out7, 7); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + 
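+    /* the *_and_aver_dst_* (ff_avg_*) paths run the same 2-tap filter and then
+     * rounding-average with the bytes already in dst, i.e. per pixel
+     *     dst[x] = (filtered + dst[x] + 1) >> 1;   (aver_u_b) */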
LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + + out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) +{ + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, + dst2, dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); +} + +void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter) +{ + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, 
tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) +{ + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, + dst4, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, + dst8, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + 
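+        /* PCKEV_AVG_ST_UB packs the two halfword results to 16 bytes,
+         * averages them with the dst row loaded above and stores the row */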
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_UB2(src, 16, src0, src5); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const int8_t *filter = vp9_bilinear_filters_msa[my - 1]; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_UB4(src, 16, 
src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, 7); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, 7); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, 7); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, 7); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 dst0, dst1, dst2, dst3, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); + hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + 
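+    /* ILVR_W2_UB above gathers the four 4-byte dst rows into dst0/dst2 so the
+     * final averaging needs only one aver_u_b per register */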
SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, + dst4, dst6); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, + res2, res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, + res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) +{ + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, 
src4); + src += (5 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) +{ + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, 7); + SAT_UH2_UH(tmp2, tmp3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, + dst3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, + dst, dst_stride, + filter_horiz, filter_vert, + height); + } +} + +void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + uint32_t loop_cnt; + const 
int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; + const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8) __msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8) __msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, 7); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); + dst += dst_stride; + } +} + +void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 2; multiple8_cnt--;) { + ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); + + src += 16; + dst += 16; + } +} + +void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int height, int mx, int my) +{ + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); + + src += 16; + dst += 16; + } +} + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + 
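+            /* only the low doubleword of each 16-byte load matters for the
+             * 8-pixel-wide copy: copy_u_d extracts it, SD4 stores four rows
+             * with 64-bit writes */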
out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + out4 = __msa_copy_u_d((v2i64) src4, 0); + out5 = __msa_copy_u_d((v2i64) src5, 0); + out6 = __msa_copy_u_d((v2i64) src6, 0); + out7 = __msa_copy_u_d((v2i64) src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + out2 = __msa_copy_u_d((v2i64) src2, 0); + out3 = __msa_copy_u_d((v2i64) src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64) src0, 0); + out1 = __msa_copy_u_d((v2i64) src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) +{ + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } 
else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + out2 = __msa_copy_u_w((v4i32) dst2, 0); + out3 = __msa_copy_u_w((v4i32) dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32) dst0, 0); + out1 = __msa_copy_u_w((v4i32) dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, 
dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + out0 = __msa_copy_u_d((v2i64) dst0, 0); + out1 = __msa_copy_u_d((v2i64) dst1, 0); + out2 = __msa_copy_u_d((v2i64) dst2, 0); + out3 = __msa_copy_u_d((v2i64) dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void avg_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 8); cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); + LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); + dst_dup += (4 * dst_stride); + LD_UB4(src, src_stride, src8, src10, src12, src14); + LD_UB4(src + 16, src_stride, src9, src11, src13, src15); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); + LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); + dst_dup += (4 * dst_stride); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, + dst8, dst9, dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, + dst12, dst13, dst14, dst15); + + ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); + dst += (4 * dst_stride); + ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); + ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height) +{ + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(src, 16, src4, src5, src6, src7); + src += src_stride; + LD_UB4(src, 16, src8, src9, src10, src11); + src += src_stride; + LD_UB4(src, 16, src12, src13, src14, src15); + src += src_stride; + + LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); + dst_dup += 
dst_stride; + LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); + dst_dup += dst_stride; + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, + dst8, dst9, dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, + dst12, dst13, dst14, dst15); + + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += dst_stride; + ST_UB4(dst4, dst5, dst6, dst7, dst, 16); + dst += dst_stride; + ST_UB4(dst8, dst9, dst10, dst11, dst, 16); + dst += dst_stride; + ST_UB4(dst12, dst13, dst14, dst15, dst, 16); + dst += dst_stride; + } +} + +static const int8_t vp9_subpel_filters_msa[3][15][8] = { + [FILTER_8TAP_REGULAR] = { + {0, 1, -5, 126, 8, -3, 1, 0}, + {-1, 3, -10, 122, 18, -6, 2, 0}, + {-1, 4, -13, 118, 27, -9, 3, -1}, + {-1, 4, -16, 112, 37, -11, 4, -1}, + {-1, 5, -18, 105, 48, -14, 4, -1}, + {-1, 5, -19, 97, 58, -16, 5, -1}, + {-1, 6, -19, 88, 68, -18, 5, -1}, + {-1, 6, -19, 78, 78, -19, 6, -1}, + {-1, 5, -18, 68, 88, -19, 6, -1}, + {-1, 5, -16, 58, 97, -19, 5, -1}, + {-1, 4, -14, 48, 105, -18, 5, -1}, + {-1, 4, -11, 37, 112, -16, 4, -1}, + {-1, 3, -9, 27, 118, -13, 4, -1}, + {0, 2, -6, 18, 122, -10, 3, -1}, + {0, 1, -3, 8, 126, -5, 1, 0}, + }, [FILTER_8TAP_SHARP] = { + {-1, 3, -7, 127, 8, -3, 1, 0}, + {-2, 5, -13, 125, 17, -6, 3, -1}, + {-3, 7, -17, 121, 27, -10, 5, -2}, + {-4, 9, -20, 115, 37, -13, 6, -2}, + {-4, 10, -23, 108, 48, -16, 8, -3}, + {-4, 10, -24, 100, 59, -19, 9, -3}, + {-4, 11, -24, 90, 70, -21, 10, -4}, + {-4, 11, -23, 80, 80, -23, 11, -4}, + {-4, 10, -21, 70, 90, -24, 11, -4}, + {-3, 9, -19, 59, 100, -24, 10, -4}, + {-3, 8, -16, 48, 108, -23, 10, -4}, + {-2, 6, -13, 37, 115, -20, 9, -4}, + {-2, 5, -10, 27, 121, -17, 7, -3}, + {-1, 3, -6, 17, 125, -13, 5, -2}, + {0, 1, -3, 8, 127, -7, 3, -1}, + }, [FILTER_8TAP_SMOOTH] = { + {-3, -1, 32, 64, 38, 1, -3, 0}, + {-2, -2, 29, 63, 41, 2, -3, 0}, + {-2, -2, 26, 63, 43, 4, -4, 0}, + {-2, -3, 24, 62, 46, 5, -4, 0}, + {-2, -3, 21, 60, 49, 7, -4, 0}, + {-1, -4, 18, 59, 51, 9, -4, 0}, + {-1, -4, 16, 57, 53, 12, -4, -1}, + {-1, -4, 14, 55, 55, 14, -4, -1}, + {-1, -4, 12, 53, 57, 16, -4, -1}, + {0, -4, 9, 51, 59, 18, -4, -1}, + {0, -4, 7, 49, 60, 21, -3, -2}, + {0, -4, 5, 46, 62, 24, -3, -2}, + {0, -4, 4, 43, 63, 26, -2, -2}, + {0, -3, 2, 41, 63, 29, -2, -2}, + {0, -3, 1, 38, 64, 32, -1, -3}, + } +}; + +#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \ +void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \ + \ + common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \ +} \ + \ +void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \ + \ + common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \ +} \ + \ +void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const uint8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \ + 
const uint8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \ + \ + common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter, \ + vfilter, h); \ +} \ + \ +void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \ + \ + common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \ + dststride, filter, h); \ +} \ + \ +void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \ + \ + common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride, \ + filter, h); \ +} \ + \ +void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const uint8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \ + const uint8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \ + \ + common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \ + dststride, hfilter, \ + vfilter, h); \ +} + +#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \ +void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + \ + copy_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ +} \ + \ +void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + \ + avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ +} + +#define VP9_AVG_MIPS_MSA_FUNC(SIZE) \ +void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + \ + avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ +} + +VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR); + +VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP); + +VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH); + +VP9_COPY_AVG_MIPS_MSA_FUNC(64); +VP9_COPY_AVG_MIPS_MSA_FUNC(32); +VP9_COPY_AVG_MIPS_MSA_FUNC(16); +VP9_COPY_AVG_MIPS_MSA_FUNC(8); +VP9_AVG_MIPS_MSA_FUNC(4); + +#undef VP9_8TAP_MIPS_MSA_FUNC +#undef VP9_COPY_AVG_MIPS_MSA_FUNC +#undef VP9_AVG_MIPS_MSA_FUNC diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c new file mode 100644 index 0000000000..c8a48908af --- /dev/null +++ b/libavcodec/mips/vp9dsp_init_mips.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/common.h" +#include "libavcodec/vp9dsp.h" +#include "vp9dsp_mips.h" + +#if HAVE_MSA +static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp) +{ + if (bpp == 8) { +#define init_intra_pred_msa(tx, sz) \ + dsp->intra_pred[tx][VERT_PRED] = ff_vert_##sz##_msa; \ + dsp->intra_pred[tx][HOR_PRED] = ff_hor_##sz##_msa; \ + dsp->intra_pred[tx][DC_PRED] = ff_dc_##sz##_msa; \ + dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa; \ + dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_msa; \ + dsp->intra_pred[tx][DC_128_PRED] = ff_dc_128_##sz##_msa; \ + dsp->intra_pred[tx][DC_127_PRED] = ff_dc_127_##sz##_msa; \ + dsp->intra_pred[tx][DC_129_PRED] = ff_dc_129_##sz##_msa; \ + dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_msa; \ + + init_intra_pred_msa(TX_16X16, 16x16); + init_intra_pred_msa(TX_32X32, 32x32); +#undef init_intra_pred_msa + +#define init_intra_pred_msa(tx, sz) \ + dsp->intra_pred[tx][DC_PRED] = ff_dc_##sz##_msa; \ + dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa; \ + dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_msa; \ + dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_msa; \ + + init_intra_pred_msa(TX_4X4, 4x4); + init_intra_pred_msa(TX_8X8, 8x8); +#undef init_intra_pred_msa + } +} + +static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp) +{ + if (bpp == 8) { +#define init_itxfm(tx, sz) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_idct_idct_##sz##_add_msa; \ + dsp->itxfm_add[tx][DCT_ADST] = ff_iadst_idct_##sz##_add_msa; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_idct_iadst_##sz##_add_msa; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_iadst_iadst_##sz##_add_msa \ + +#define init_idct(tx, nm) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = nm##_add_msa + + init_itxfm(TX_4X4, 4x4); + init_itxfm(TX_8X8, 8x8); + init_itxfm(TX_16X16, 16x16); + init_idct(TX_32X32, ff_idct_idct_32x32); +#undef init_itxfm +#undef init_idct + } +} + +static av_cold void vp9dsp_mc_init_msa(VP9DSPContext *dsp, int bpp) +{ + if (bpp == 8) { +#define init_fpel(idx1, idx2, sz, type) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = ff_##type##sz##_msa; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = ff_##type##sz##_msa; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = ff_##type##sz##_msa; \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_msa + +#define init_copy_avg(idx, sz) \ + init_fpel(idx, 0, sz, copy); \ + init_fpel(idx, 1, sz, avg) + +#define init_avg(idx, sz) \ + init_fpel(idx, 1, sz, avg) + + init_copy_avg(0, 64); + init_copy_avg(1, 32); + init_copy_avg(2, 16); + init_copy_avg(3, 8); + init_avg(4, 4); + +#undef init_copy_avg +#undef init_avg +#undef init_fpel + +#define init_subpel1(idx1, 
idx2, idxh, idxv, sz, dir, type) \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = \ + ff_##type##_bilin_##sz##dir##_msa; \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + ff_##type##_8tap_smooth_##sz##dir##_msa; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + ff_##type##_8tap_regular_##sz##dir##_msa; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + ff_##type##_8tap_sharp_##sz##dir##_msa; + +#define init_subpel2(idx, idxh, idxv, dir, type) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type) + +#define init_subpel3(idx, type) \ + init_subpel2(idx, 1, 1, hv, type); \ + init_subpel2(idx, 0, 1, v, type); \ + init_subpel2(idx, 1, 0, h, type) + + init_subpel3(0, put); + init_subpel3(1, avg); + +#undef init_subpel1 +#undef init_subpel2 +#undef init_subpel3 + } +} + +static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp) +{ + if (bpp == 8) { + dsp->loop_filter_8[0][0] = ff_loop_filter_h_4_8_msa; + dsp->loop_filter_8[0][1] = ff_loop_filter_v_4_8_msa; + dsp->loop_filter_8[1][0] = ff_loop_filter_h_8_8_msa; + dsp->loop_filter_8[1][1] = ff_loop_filter_v_8_8_msa; + dsp->loop_filter_8[2][0] = ff_loop_filter_h_16_8_msa; + dsp->loop_filter_8[2][1] = ff_loop_filter_v_16_8_msa; + + dsp->loop_filter_16[0] = ff_loop_filter_h_16_16_msa; + dsp->loop_filter_16[1] = ff_loop_filter_v_16_16_msa; + + dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_msa; + dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_msa; + dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_msa; + dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_msa; + dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_msa; + dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_msa; + dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_msa; + dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_msa; + } +} + +static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp) +{ + vp9dsp_intrapred_init_msa(dsp, bpp); + vp9dsp_itxfm_init_msa(dsp, bpp); + vp9dsp_mc_init_msa(dsp, bpp); + vp9dsp_loopfilter_init_msa(dsp, bpp); +} +#endif // #if HAVE_MSA + +av_cold void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp) +{ +#if HAVE_MSA + vp9dsp_init_msa(dsp, bpp); +#endif // #if HAVE_MSA +} diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h new file mode 100644 index 0000000000..4d7303888d --- /dev/null +++ b/libavcodec/mips/vp9dsp_mips.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_VP9DSP_MIPS_H +#define AVCODEC_MIPS_VP9DSP_MIPS_H + +#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \ +void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my); + +#define VP9_BILINEAR_MIPS_MSA_FUNC(SIZE) \ +void ff_put_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_put_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_put_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); + +#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \ +void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); \ + \ +void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int mx, int my); + +VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR); +VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR); + +VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP); +VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP); + +VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH); + +VP9_BILINEAR_MIPS_MSA_FUNC(64); +VP9_BILINEAR_MIPS_MSA_FUNC(32); +VP9_BILINEAR_MIPS_MSA_FUNC(16); +VP9_BILINEAR_MIPS_MSA_FUNC(8); 
+VP9_BILINEAR_MIPS_MSA_FUNC(4); + +VP9_COPY_AVG_MIPS_MSA_FUNC(64); +VP9_COPY_AVG_MIPS_MSA_FUNC(32); +VP9_COPY_AVG_MIPS_MSA_FUNC(16); +VP9_COPY_AVG_MIPS_MSA_FUNC(8); +VP9_COPY_AVG_MIPS_MSA_FUNC(4); + +#undef VP9_8TAP_MIPS_MSA_FUNC +#undef VP9_BILINEAR_MIPS_MSA_FUNC +#undef VP9_COPY_AVG_MIPS_MSA_FUNC + +void ff_loop_filter_h_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_h_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_loop_filter_v_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e, + int32_t i, int32_t h); +void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_iadst_4x4_add_msa(uint8_t *pu8Dest, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_iadst_8x8_add_msa(uint8_t *pu8Dest, ptrdiff_t stride, + int16_t *block, int eob); +void ff_idct_iadst_16x16_add_msa(uint8_t *pu8Dest, ptrdiff_t stride, + int16_t *block, int eob); +void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); + +void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_32x32_msa(uint8_t *dst, 
ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); + +#endif // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H diff --git a/libavcodec/mips/xvid_idct_mmi.c b/libavcodec/mips/xvid_idct_mmi.c new file mode 100644 index 0000000000..d3f9acb0e2 --- /dev/null +++ b/libavcodec/mips/xvid_idct_mmi.c @@ -0,0 +1,253 @@ +/* + * Loongson SIMD optimized xvid idct + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "idctdsp_mips.h" +#include "xvididct_mips.h" + +#define BITS_INV_ACC 5 // 4 or 5 for IEEE +#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 +#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 +#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) +#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) +#define RND_INV_CORR (RND_INV_COL - 1) + +#define BITS_FRW_ACC 3 // 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) +#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) + +DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { + 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 + 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 + -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 + 23170, 23170, 23170, 23170 // cos * (2<<15) + 0.5 +}; + +DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { + 65536,65536, + 3597, 3597, + 2260, 2260, + 1203, 1203, + 0, 0, + 120, 120, + 512, 512, + 512, 512 +}; + +DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmi)[32*4] = { + 16384, 21407, 16384, 8867, // w05 w04 w01 w00 + 16384, 8867,-16384,-21407, // w07 w06 w03 w02 + 16384, -8867, 16384,-21407, // w13 w12 w09 w08 + -16384, 21407, 16384, -8867, // w15 w14 w11 w10 + 22725, 19266, 19266, -4520, // w21 w20 w17 w16 + 12873, 4520,-22725,-12873, // w23 w22 w19 w18 + 12873,-22725, 4520,-12873, // w29 w28 w25 w24 + 4520, 19266, 19266,-22725, // w31 w30 w27 w26 + + 22725, 29692, 22725, 12299, // w05 w04 w01 w00 + 22725, 12299,-22725,-29692, // w07 w06 w03 w02 + 22725,-12299, 22725,-29692, // w13 w12 w09 w08 + -22725, 29692, 22725,-12299, // w15 w14 w11 w10 + 31521, 26722, 26722, -6270, // w21 w20 w17 w16 + 17855, 6270,-31521,-17855, // w23 w22 w19 w18 + 17855,-31521, 6270,-17855, // w29 w28 w25 w24 + 6270, 26722, 26722,-31521, // w31 w30 w27 w26 + + 21407, 27969, 21407, 11585, // w05 w04 w01 w00 + 21407, 11585,-21407,-27969, // w07 w06 w03 w02 + 21407,-11585, 21407,-27969, // w13 w12 w09 w08 + -21407, 27969, 21407,-11585, // w15 w14 w11 w10 + 29692, 25172, 25172, -5906, // w21 w20 w17 w16 + 16819, 5906,-29692,-16819, // w23 w22 w19 w18 + 16819,-29692, 5906,-16819, // w29 w28 w25 w24 + 5906, 25172, 25172,-29692, // w31 w30 w27 w26 + + 19266, 25172, 19266, 10426, // w05 w04 w01 w00 + 19266, 10426,-19266,-25172, // w07 w06 w03 w02 + 19266,-10426, 19266,-25172, // w13 w12 w09 w08 + -19266, 25172, 19266,-10426, // w15 w14 w11 w10 + 26722, 22654, 22654, -5315, // w21 w20 w17 w16 + 15137, 5315,-26722,-15137, // w23 w22 w19 w18 + 15137,-26722, 5315,-15137, // w29 w28 w25 w24 + 5315, 22654, 22654,-26722, // w31 w30 w27 w26 +}; + +#define DCT_8_INV_ROW_MMI(A1,A2,A3,A4) \ + "dli $10, 0x88 \n\t" \ + "ldc1 $f4, "#A1" \n\t" /* 0; x3 x2 x1 x0 */\ + "dmtc1 $10, $f16 \n\t" \ + "ldc1 $f10, 8+"#A1" \n\t" /* 1; x7 x6 x5 x4 */\ + "ldc1 $f6, "#A3" \n\t" /* 3; w05 w04 w01 w00 */\ + "pshufh $f0, $f4, $f16 \n\t" /* x2 x0 x2 x0 */\ + "ldc1 $f8, 8+"#A3" \n\t" /* 4; w07 w06 w03 w02 */\ + "ldc1 $f12, 32+"#A3" \n\t" /* 6; w21 w20 w17 w16 */\ + "pmaddhw $f6, $f6, $f0 \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */\ + "dli $10, 0xdd \n\t" \ + "pshufh $f2, $f10, $f16 \n\t" /* x6 x4 x6 x4 */\ + "dmtc1 $10, $f16 \n\t" \ + "pmaddhw $f8, $f8, $f2 \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */\ + "ldc1 $f14, 40+"#A3" \n\t" /* 7; w23 w22 w19 w18 */\ + "pshufh $f4, $f4, $f16 \n\t" /* x3 x1 x3 x1 */\ 
+ "pmaddhw $f12, $f12, $f4 \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */\ + "pshufh $f10, $f10, $f16 \n\t" /* x7 x5 x7 x5 */\ + "ldc1 $f18, "#A4" \n\t" \ + "pmaddhw $f14, $f14, $f10 \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */\ + "paddw $f6, $f6, $f18 \n\t" /* +%4 */\ + "ldc1 $f16, 16+"#A3" \n\t" \ + "pmaddhw $f0, $f0, $f16 \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */\ + "ldc1 $f16, 24+"#A3" \n\t" \ + "paddw $f6, $f6, $f8 \n\t" /* 4; a1=sum(even1) a0=sum(even0) */\ + "pmaddhw $f2, $f2, $f16 \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */\ + "ldc1 $f16, 48+"#A3" \n\t" \ + "pmaddhw $f4, $f4, $f16 \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */\ + "ldc1 $f16, 56+"#A3" \n\t" \ + "paddw $f12, $f12, $f14 \n\t" /* 7; b1=sum(odd1) b0=sum(odd0) */\ + "dli $10, 11 \n\t" \ + "pmaddhw $f10, $f10, $f16 \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */\ + "dmtc1 $10, $f16 \n\t" \ + "psubw $f8, $f6, $f12 \n\t" /* 6; a1-b1 a0-b0 */\ + "paddw $f6, $f6, $f12 \n\t" /* a1+b1 a0+b0 */\ + "paddw $f0, $f0, $f18 \n\t" /* +%4 */\ + "psraw $f6, $f6, $f16 \n\t" /* y1=a1+b1 y0=a0+b0 */\ + "paddw $f0, $f0, $f2 \n\t" /* 1; a3=sum(even3) a2=sum(even2) */\ + "paddw $f4, $f4, $f10 \n\t" /* 5; b3=sum(odd3) b2=sum(odd2) */\ + "psraw $f8, $f8, $f16 \n\t" /* y6=a1-b1 y7=a0-b0 */\ + "psubw $f14, $f0, $f4 \n\t" /* 2; a3-b3 a2-b2 */\ + "paddw $f0, $f0, $f4 \n\t" /* a3+b3 a2+b2 */\ + "psraw $f0, $f0, $f16 \n\t" /* y3=a3+b3 y2=a2+b2 */\ + "psraw $f14, $f14, $f16 \n\t" /* y4=a3-b3 y5=a2-b2 */\ + "dli $10, 0xb1 \n\t" \ + "packsswh $f6, $f6, $f0 \n\t" /* 0; y3 y2 y1 y0 */\ + "dmtc1 $10, $f16 \n\t" \ + "packsswh $f14, $f14, $f8 \n\t" /* 4; y6 y7 y4 y5 */\ + "sdc1 $f6, "#A2" \n\t" /* 3; save y3 y2 y1 y0 */\ + "pshufh $f14, $f14, $f16 \n\t" /* y7 y6 y5 y4 */\ + "sdc1 $f14, 8+"#A2" \n\t" /* 7; save y7 y6 y5 y4 */\ + + +#define DCT_8_INV_COL(A1,A2) \ + "ldc1 $f2, 2*8(%3) \n\t" \ + "ldc1 $f6, 16*3+"#A1" \n\t" \ + "ldc1 $f10, 16*5+"#A1" \n\t" \ + "pmulhh $f0, $f2, $f6 \n\t" /* x3*(tg_3_16-1) */\ + "ldc1 $f4, 0(%3) \n\t" \ + "pmulhh $f2, $f2, $f10 \n\t" /* x5*(tg_3_16-1) */\ + "ldc1 $f14, 16*7+"#A1" \n\t" \ + "ldc1 $f12, 16*1+"#A1" \n\t" \ + "pmulhh $f8, $f4, $f14 \n\t" /* x7*tg_1_16 */\ + "paddsh $f0, $f0, $f6 \n\t" /* x3*tg_3_16 */\ + "pmulhh $f4, $f4, $f12 \n\t" /* x1*tg_1_16 */\ + "paddsh $f2, $f2, $f6 \n\t" /* x3+x5*(tg_3_16-1) */\ + "psubsh $f0, $f0, $f10 \n\t" /* x3*tg_3_16-x5 = tm35 */\ + "ldc1 $f6, 3*8(%3) \n\t" \ + "paddsh $f2, $f2, $f10 \n\t" /* x3+x5*tg_3_16 = tp35 */\ + "paddsh $f8, $f8, $f12 \n\t" /* x1+tg_1_16*x7 = tp17 */\ + "psubsh $f4, $f4, $f14 \n\t" /* x1*tg_1_16-x7 = tm17 */\ + "paddsh $f10, $f8, $f2 \n\t" /* tp17+tp35 = b0 */\ + "psubsh $f12, $f4, $f0 \n\t" /* tm17-tm35 = b3 */\ + "psubsh $f8, $f8, $f2 \n\t" /* tp17-tp35 = t1 */\ + "paddsh $f4, $f4, $f0 \n\t" /* tm17+tm35 = t2 */\ + "ldc1 $f14, 1*8(%3) \n\t" \ + "sdc1 $f10, 3*16+"#A2" \n\t" /* save b0 */\ + "paddsh $f2, $f8, $f4 \n\t" /* t1+t2 */\ + "sdc1 $f12, 5*16+"#A2" \n\t" /* save b3 */\ + "psubsh $f8, $f8, $f4 \n\t" /* t1-t2 */\ + "ldc1 $f10, 2*16+"#A1" \n\t" \ + "ldc1 $f12, 6*16+"#A1" \n\t" \ + "pmulhh $f0, $f14, $f10 \n\t" /* x2*tg_2_16 */\ + "pmulhh $f14, $f14, $f12 \n\t" /* x6*tg_2_16 */\ + "pmulhh $f2, $f2, $f6 \n\t" /* ocos_4_16*(t1+t2) = b1/2 */\ + "ldc1 $f4, 0*16+"#A1" \n\t" \ + "pmulhh $f8, $f8, $f6 \n\t" /* ocos_4_16*(t1-t2) = b2/2 */\ + "psubsh $f0, $f0, $f12 \n\t" /* t2*tg_2_16-x6 = tm26 */\ + "ldc1 $f12, 4*16+"#A1" \n\t" \ + "paddsh $f14, $f14, $f10 \n\t" /* x2+x6*tg_2_16 = tp26 */\ + "psubsh $f6, $f4, $f12 \n\t" /* x0-x4 = tm04 */\ + "paddsh $f4, $f4, $f12 \n\t" /* x0+x4 = tp04 */\ + "paddsh 
$f10, $f4, $f14 \n\t" /* tp04+tp26 = a0 */\ + "psubsh $f12, $f6, $f0 \n\t" /* tm04-tm26 = a2 */\ + "psubsh $f4, $f4, $f14 \n\t" /* tp04-tp26 = a3 */\ + "paddsh $f6, $f6, $f0 \n\t" /* tm04+tm26 = a1 */\ + "paddsh $f2, $f2, $f2 \n\t" /* b1 */\ + "paddsh $f8, $f8, $f8 \n\t" /* b2 */\ + "psubsh $f14, $f6, $f2 \n\t" /* a1-b1 */\ + "dli $10, 6 \n\t" \ + "paddsh $f6, $f6, $f2 \n\t" /* a1+b1 */\ + "dmtc1 $10, $f16 \n\t" \ + "psubsh $f0, $f12, $f8 \n\t" /* a2-b2 */\ + "paddsh $f12, $f12, $f8 \n\t" /* a2+b2 */\ + "psrah $f6, $f6, $f16 \n\t" /* dst1 */\ + "psrah $f12, $f12, $f16 \n\t" /* dst2 */\ + "ldc1 $f2, 3*16+"#A2" \n\t" /* load b0 */\ + "psrah $f14, $f14, $f16 \n\t" /* dst6 */\ + "psrah $f0, $f0, $f16 \n\t" /* dst5 */\ + "sdc1 $f6, 1*16+"#A2" \n\t" \ + "psubsh $f8, $f10, $f2 \n\t" /* a0-b0 */\ + "paddsh $f10, $f10, $f2 \n\t" /* a0+b0 */\ + "sdc1 $f12, 2*16+"#A2" \n\t" \ + "ldc1 $f6, 5*16+"#A2" \n\t" /* load b3 */\ + "psrah $f10, $f10, $f16 \n\t" /* dst0 */\ + "psrah $f8, $f8, $f16 \n\t" /* dst7 */\ + "sdc1 $f0, 5*16+"#A2" \n\t" \ + "psubsh $f12, $f4, $f6 \n\t" /* a3-b3 */\ + "paddsh $f4, $f4, $f6 \n\t" /* a3+b3 */\ + "sdc1 $f14, 6*16+"#A2" \n\t" \ + "sdc1 $f10, 0*16+"#A2" \n\t" \ + "psrah $f4, $f4, $f16 \n\t" /* dst3 */\ + "sdc1 $f8, 7*16+"#A2" \n\t" \ + "psrah $f12, $f12, $f16 \n\t" /* dst4 */\ + "sdc1 $f4, 3*16+"#A2" \n\t" \ + "sdc1 $f12, 4*16+"#A2" \n\t" \ + + +void ff_xvid_idct_mmi(int16_t *block) +{ + __asm__ volatile ( + //# Process each row + DCT_8_INV_ROW_MMI(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) + DCT_8_INV_ROW_MMI(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) + DCT_8_INV_ROW_MMI(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) + DCT_8_INV_ROW_MMI(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) + DCT_8_INV_ROW_MMI(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) + DCT_8_INV_ROW_MMI(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) + DCT_8_INV_ROW_MMI(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) + DCT_8_INV_ROW_MMI(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) + //# Process the columns (4 at a time) + DCT_8_INV_COL(0(%0), 0(%0)) + DCT_8_INV_COL(8(%0), 8(%0)) + ::"r"(block),"r"(rounder_0),"r"(tab_i_04_mmi),"r"(tg_1_16) + : "$10" + ); +} + +void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block) +{ + ff_xvid_idct_mmi(block); + ff_put_pixels_clamped_mmi(block, dest, line_size); +} + +void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block) +{ + ff_xvid_idct_mmi(block); + ff_add_pixels_clamped_mmi(block, dest, line_size); +} diff --git a/libavcodec/mips/xvididct_init_mips.c b/libavcodec/mips/xvididct_init_mips.c new file mode 100644 index 0000000000..c1d82cc30c --- /dev/null +++ b/libavcodec/mips/xvididct_init_mips.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xvididct_mips.h" + +#if HAVE_MMI +static av_cold void xvid_idct_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_XVID) { + c->idct_put = ff_xvid_idct_put_mmi; + c->idct_add = ff_xvid_idct_add_mmi; + c->idct = ff_xvid_idct_mmi; + c->perm_type = FF_IDCT_PERM_NONE; + } + } +} +#endif /* HAVE_MMI */ + +av_cold void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ +#if HAVE_MMI + xvid_idct_init_mmi(c, avctx, high_bit_depth); +#endif /* HAVE_MMI */ +} diff --git a/libavcodec/mips/xvididct_mips.h b/libavcodec/mips/xvididct_mips.h new file mode 100644 index 0000000000..0768aaa26f --- /dev/null +++ b/libavcodec/mips/xvididct_mips.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_XVIDIDCT_MIPS_H +#define AVCODEC_MIPS_XVIDIDCT_MIPS_H + +#include "libavcodec/xvididct.h" + +void ff_xvid_idct_mmi(int16_t *block); +void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block); +void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block); + +#endif /* AVCODEC_MIPS_XVIDIDCT_MIPS_H */ |
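For reference, the MSA common_hz_8t_*w_msa routines wired up by the ff_put_8tap_*_msa wrappers above implement the standard VP9 8-tap sub-pixel convolution. A minimal scalar sketch of that arithmetic is shown below; it is an illustration only, not part of the patch, the helper names are hypothetical, and it assumes the usual VP9 behaviour of coefficients from vp9_subpel_filters_msa summing to 128 with (sum + 64) >> 7 rounding and 8-bit clamping.

#include <stdint.h>

/* Hypothetical scalar model of the 8-tap horizontal filter that the
 * MSA common_hz_8t_<w>w_msa routines vectorize (sketch, not the patch). */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

static void put_8tap_h_scalar(uint8_t *dst, int dst_stride,
                              const uint8_t *src, int src_stride,
                              int w, int h, const int8_t *filter)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            /* taps are centred on the output pixel: src[x - 3 .. x + 4] */
            for (int k = 0; k < 8; k++)
                sum += filter[k] * src[x + k - 3];
            dst[x] = clip_u8((sum + 64) >> 7);  /* 7-bit filter scaling */
        }
        src += src_stride;
        dst += dst_stride;
    }
}

The ff_avg_* variants differ only in that the filtered (or copied) result is combined with the existing destination pixel by a rounded average, (dst + res + 1) >> 1 per byte, which is the per-lane operation performed by the AVER_UB2_UB/AVER_UB4_UB macros in the code above.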