summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNedeljko Babic <nedeljko.babic@imgtec.com>2015-03-05 12:23:59 +0100
committerMichael Niedermayer <michaelni@gmx.at>2015-03-05 12:31:44 +0100
commitdfa920807494f0bc505aa090e036b531daa604ad (patch)
tree97f2a121f95c834544326e477193108bf84c0cf1
parent45555a20c6f425b2999612f80d287253a24216bd (diff)
mips/float_dsp: fix a bug in vector_fmul_window_mips
The loop was unrolled eight times, although the header only assumes that len is a multiple of 4. This is fixed, and the assembly code is rewritten to be more efficient and to simplify the clobber list. Signed-off-by: Nedeljko Babic <nedeljko.babic@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--libavutil/mips/float_dsp_mips.c180
1 files changed, 74 insertions, 106 deletions
diff --git a/libavutil/mips/float_dsp_mips.c b/libavutil/mips/float_dsp_mips.c
index 06d52dc258..a54f959329 100644
--- a/libavutil/mips/float_dsp_mips.c
+++ b/libavutil/mips/float_dsp_mips.c
@@ -144,114 +144,82 @@ static void vector_fmul_scalar_mips(float *dst, const float *src, float mul,
}
static void vector_fmul_window_mips(float *dst, const float *src0,
- const float *src1, const float *win, int len)
+ const float *src1, const float *win, int len)
{
- int i, j;
- /*
- * variables used in inline assembler
- */
- float * dst_i, * dst_j, * dst_i2, * dst_j2;
- float temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ float * dst_j, *win_j, *src0_i, *src1_j, *dst_i, *win_i;
+ float temp, temp1, temp2, temp3;
+ float s0, s01, s1, s11;
+ float wi, wi1, wi2, wi3;
+ float wj, wj1, wj2, wj3;
+ const float * lp_end = win + len;
+
+ win_i = (float *)win;
+ win_j = (float *)(win + 2 * len -1);
+ src1_j = (float *)(src1 + len - 1);
+ src0_i = (float *)src0;
+ dst_i = (float *)dst;
+ dst_j = (float *)(dst + 2 * len -1);
- dst += len;
- win += len;
- src0 += len;
-
- for (i = -len, j = len - 1; i < 0; i += 8, j -= 8) {
-
- dst_i = dst + i;
- dst_j = dst + j;
-
- dst_i2 = dst + i + 4;
- dst_j2 = dst + j - 4;
-
- __asm__ volatile (
- "mul.s %[temp], %[s1], %[wi] \n\t"
- "mul.s %[temp1], %[s1], %[wj] \n\t"
- "mul.s %[temp2], %[s11], %[wi1] \n\t"
- "mul.s %[temp3], %[s11], %[wj1] \n\t"
-
- "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
- "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
- "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
- "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
-
- "swc1 %[temp], 0(%[dst_i]) \n\t" /* dst[i] = s0*wj - s1*wi; */
- "swc1 %[temp1], 0(%[dst_j]) \n\t" /* dst[j] = s0*wi + s1*wj; */
- "swc1 %[temp2], 4(%[dst_i]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
- "swc1 %[temp3], -4(%[dst_j]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
-
- "mul.s %[temp4], %[s12], %[wi2] \n\t"
- "mul.s %[temp5], %[s12], %[wj2] \n\t"
- "mul.s %[temp6], %[s13], %[wi3] \n\t"
- "mul.s %[temp7], %[s13], %[wj3] \n\t"
-
- "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
- "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
- "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
- "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
-
- "swc1 %[temp4], 8(%[dst_i]) \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
- "swc1 %[temp5], -8(%[dst_j]) \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
- "swc1 %[temp6], 12(%[dst_i]) \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
- "swc1 %[temp7], -12(%[dst_j]) \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
- : [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
- [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
- [temp6]"=&f"(temp6), [temp7]"=&f"(temp7)
- : [dst_j]"r"(dst_j), [dst_i]"r" (dst_i),
- [s0] "f"(src0[i]), [wj] "f"(win[j]), [s1] "f"(src1[j]),
- [wi] "f"(win[i]), [s01]"f"(src0[i + 1]),[wj1]"f"(win[j - 1]),
- [s11]"f"(src1[j - 1]), [wi1]"f"(win[i + 1]), [s02]"f"(src0[i + 2]),
- [wj2]"f"(win[j - 2]), [s12]"f"(src1[j - 2]),[wi2]"f"(win[i + 2]),
- [s03]"f"(src0[i + 3]), [wj3]"f"(win[j - 3]), [s13]"f"(src1[j - 3]),
- [wi3]"f"(win[i + 3])
- : "memory"
- );
-
- __asm__ volatile (
- "mul.s %[temp], %[s1], %[wi] \n\t"
- "mul.s %[temp1], %[s1], %[wj] \n\t"
- "mul.s %[temp2], %[s11], %[wi1] \n\t"
- "mul.s %[temp3], %[s11], %[wj1] \n\t"
-
- "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
- "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
- "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
- "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
-
- "swc1 %[temp], 0(%[dst_i2]) \n\t" /* dst[i] = s0*wj - s1*wi; */
- "swc1 %[temp1], 0(%[dst_j2]) \n\t" /* dst[j] = s0*wi + s1*wj; */
- "swc1 %[temp2], 4(%[dst_i2]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
- "swc1 %[temp3], -4(%[dst_j2]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
-
- "mul.s %[temp4], %[s12], %[wi2] \n\t"
- "mul.s %[temp5], %[s12], %[wj2] \n\t"
- "mul.s %[temp6], %[s13], %[wi3] \n\t"
- "mul.s %[temp7], %[s13], %[wj3] \n\t"
-
- "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
- "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
- "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
- "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
-
- "swc1 %[temp4], 8(%[dst_i2]) \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
- "swc1 %[temp5], -8(%[dst_j2]) \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
- "swc1 %[temp6], 12(%[dst_i2]) \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
- "swc1 %[temp7], -12(%[dst_j2]) \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
- : [temp]"=&f"(temp),
- [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
- [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
- [temp7] "=&f" (temp7)
- : [dst_j2]"r"(dst_j2), [dst_i2]"r"(dst_i2),
- [s0] "f"(src0[i + 4]), [wj] "f"(win[j - 4]), [s1] "f"(src1[j - 4]),
- [wi] "f"(win[i + 4]), [s01]"f"(src0[i + 5]),[wj1]"f"(win[j - 5]),
- [s11]"f"(src1[j - 5]), [wi1]"f"(win[i + 5]), [s02]"f"(src0[i + 6]),
- [wj2]"f"(win[j - 6]), [s12]"f"(src1[j - 6]),[wi2]"f"(win[i + 6]),
- [s03]"f"(src0[i + 7]), [wj3]"f"(win[j - 7]), [s13]"f"(src1[j - 7]),
- [wi3]"f"(win[i + 7])
- : "memory"
- );
- }
+ /* loop unrolled 4 times */
+ __asm__ volatile (
+ "1:"
+ "lwc1 %[s1], 0(%[src1_j]) \n\t"
+ "lwc1 %[wi], 0(%[win_i]) \n\t"
+ "lwc1 %[wj], 0(%[win_j]) \n\t"
+ "lwc1 %[s11], -4(%[src1_j]) \n\t"
+ "lwc1 %[wi1], 4(%[win_i]) \n\t"
+ "lwc1 %[wj1], -4(%[win_j]) \n\t"
+ "lwc1 %[s0], 0(%[src0_i]) \n\t"
+ "lwc1 %[s01], 4(%[src0_i]) \n\t"
+ "mul.s %[temp], %[s1], %[wi] \n\t"
+ "mul.s %[temp1], %[s1], %[wj] \n\t"
+ "mul.s %[temp2], %[s11], %[wi1] \n\t"
+ "mul.s %[temp3], %[s11], %[wj1] \n\t"
+ "lwc1 %[s1], -8(%[src1_j]) \n\t"
+ "lwc1 %[wi2], 8(%[win_i]) \n\t"
+ "lwc1 %[wj2], -8(%[win_j]) \n\t"
+ "lwc1 %[s11], -12(%[src1_j]) \n\t"
+ "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
+ "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
+ "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
+ "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
+ "lwc1 %[wi3], 12(%[win_i]) \n\t"
+ "lwc1 %[wj3], -12(%[win_j]) \n\t"
+ "lwc1 %[s0], 8(%[src0_i]) \n\t"
+ "lwc1 %[s01], 12(%[src0_i]) \n\t"
+ "addiu %[src1_j],-16 \n\t"
+ "addiu %[win_i], 16 \n\t"
+ "addiu %[win_j], -16 \n\t"
+ "addiu %[src0_i], 16 \n\t"
+ "swc1 %[temp], 0(%[dst_i]) \n\t" /* dst[i] = s0*wj - s1*wi; */
+ "swc1 %[temp1], 0(%[dst_j]) \n\t" /* dst[j] = s0*wi + s1*wj; */
+ "swc1 %[temp2], 4(%[dst_i]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
+ "swc1 %[temp3], -4(%[dst_j]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
+ "mul.s %[temp], %[s1], %[wi2] \n\t"
+ "mul.s %[temp1], %[s1], %[wj2] \n\t"
+ "mul.s %[temp2], %[s11], %[wi3] \n\t"
+ "mul.s %[temp3], %[s11], %[wj3] \n\t"
+ "msub.s %[temp], %[temp], %[s0], %[wj2] \n\t"
+ "madd.s %[temp1], %[temp1], %[s0], %[wi2] \n\t"
+ "msub.s %[temp2], %[temp2], %[s01], %[wj3] \n\t"
+ "madd.s %[temp3], %[temp3], %[s01], %[wi3] \n\t"
+ "swc1 %[temp], 8(%[dst_i]) \n\t" /* dst[i+2] = s0*wj2 - s1*wi2; */
+ "swc1 %[temp1], -8(%[dst_j]) \n\t" /* dst[j-2] = s0*wi2 + s1*wj2; */
+ "swc1 %[temp2], 12(%[dst_i]) \n\t" /* dst[i+3] = s01*wj3 - s11*wi3; */
+ "swc1 %[temp3], -12(%[dst_j]) \n\t" /* dst[j-3] = s01*wi3 + s11*wj3; */
+ "addiu %[dst_i], 16 \n\t"
+ "addiu %[dst_j], -16 \n\t"
+ "bne %[win_i], %[lp_end], 1b \n\t"
+ : [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+ [temp3]"=&f"(temp3), [src0_i]"+r"(src0_i), [win_i]"+r"(win_i),
+ [src1_j]"+r"(src1_j), [win_j]"+r"(win_j), [dst_i]"+r"(dst_i),
+ [dst_j]"+r"(dst_j), [s0] "=&f"(s0), [s01]"=&f"(s01), [s1] "=&f"(s1),
+ [s11]"=&f"(s11), [wi] "=&f"(wi), [wj] "=&f"(wj), [wi2]"=&f"(wi2),
+ [wj2]"=&f"(wj2), [wi3]"=&f"(wi3), [wj3]"=&f"(wj3), [wi1]"=&f"(wi1),
+ [wj1]"=&f"(wj1)
+ : [lp_end]"r"(lp_end)
+ : "memory"
+ );
}
static void butterflies_float_mips(float *av_restrict v1, float *av_restrict v2,