summary | refs | log | tree | commit | diff
path: root/libswscale/ppc
diff options
context:
space:
mode:
author: Mans Rullgard <mans@mansr.com> 2012-10-04 17:30:34 +0100
committer: Mans Rullgard <mans@mansr.com> 2012-10-05 22:33:32 +0100
commit: 07eb7e20af63a244d9e1813626fac38a84e8c869 (patch)
tree: b83758778ef483137dac80981d1333083279078b /libswscale/ppc
parent: 642b4efaf7b3055ab4b26bda252149eb35babc4b (diff)
ppc: swscale: rework yuv2planeX_altivec()
This gets rid of the variable-length scratch buffer by filtering 16 pixels at a time and writing directly to the destination. The extra loads this requires to load the source values are compensated by not doing a round-trip to memory before shifting.

Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libswscale/ppc')
-rw-r--r-- libswscale/ppc/swscale_altivec.c 152
1 files changed, 63 insertions, 89 deletions
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 0e66ec1f7b..7616ddf1fa 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -32,78 +32,37 @@
#define vzero vec_splat_s32(0)
-static inline void altivec_packIntArrayToCharArray(int *val, uint8_t *dest,
- int dstW)
+#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do { \
+ vector signed short l2 = vec_ld(((x) << 1) + 16, src); \
+ vector signed short ls = vec_perm(l1, l2, perm); \
+ vector signed int i1 = vec_mule(filter, ls); \
+ vector signed int i2 = vec_mulo(filter, ls); \
+ vector signed int vf1 = vec_mergeh(i1, i2); \
+ vector signed int vf2 = vec_mergel(i1, i2); \
+ d1 = vec_add(d1, vf1); \
+ d2 = vec_add(d2, vf2); \
+ l1 = l2; \
+ } while (0)
+
+static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest,
+ const uint8_t *dither, int offset, int x)
{
- register int i;
+ register int i, j;
+ DECLARE_ALIGNED(16, int, val)[16];
+ vector signed int vo1, vo2, vo3, vo4;
+ vector unsigned short vs1, vs2;
+ vector unsigned char vf;
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
- if ((uintptr_t)dest % 16) {
- /* badly aligned store, we force store alignment */
- /* and will handle load misalignment on val w/ vec_perm */
- vector unsigned char perm1;
- vector signed int v1;
- for (i = 0; (i < dstW) &&
- (((uintptr_t)dest + i) % 16); i++) {
- int t = val[i] >> 19;
- dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
- }
- perm1 = vec_lvsl(i << 2, val);
- v1 = vec_ld(i << 2, val);
- for (; i < (dstW - 15); i += 16) {
- int offset = i << 2;
- vector signed int v2 = vec_ld(offset + 16, val);
- vector signed int v3 = vec_ld(offset + 32, val);
- vector signed int v4 = vec_ld(offset + 48, val);
- vector signed int v5 = vec_ld(offset + 64, val);
- vector signed int v12 = vec_perm(v1, v2, perm1);
- vector signed int v23 = vec_perm(v2, v3, perm1);
- vector signed int v34 = vec_perm(v3, v4, perm1);
- vector signed int v45 = vec_perm(v4, v5, perm1);
-
- vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
- vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
- vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
- vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
- vector unsigned short vs1 = vec_packsu(vA, vB);
- vector unsigned short vs2 = vec_packsu(vC, vD);
- vector unsigned char vf = vec_packsu(vs1, vs2);
- vec_st(vf, i, dest);
- v1 = v5;
- }
- } else { // dest is properly aligned, great
- for (i = 0; i < (dstW - 15); i += 16) {
- int offset = i << 2;
- vector signed int v1 = vec_ld(offset, val);
- vector signed int v2 = vec_ld(offset + 16, val);
- vector signed int v3 = vec_ld(offset + 32, val);
- vector signed int v4 = vec_ld(offset + 48, val);
- vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
- vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
- vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
- vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
- vector unsigned short vs1 = vec_packsu(v5, v6);
- vector unsigned short vs2 = vec_packsu(v7, v8);
- vector unsigned char vf = vec_packsu(vs1, vs2);
- vec_st(vf, i, dest);
- }
- }
- for (; i < dstW; i++) {
- int t = val[i] >> 19;
- dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
- }
-}
-// FIXME remove the usage of scratch buffers.
-static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset)
-{
- register int i, j;
- DECLARE_ALIGNED(16, int, val)[dstW];
+ for (i = 0; i < 16; i++)
+ val[i] = dither[(x + i + offset) & 7] << 12;
- for (i = 0; i < dstW; i++)
- val[i] = dither[(i + offset) & 7] << 12;
+ vo1 = vec_ld(0, val);
+ vo2 = vec_ld(16, val);
+ vo3 = vec_ld(32, val);
+ vo4 = vec_ld(48, val);
for (j = 0; j < filterSize; j++) {
vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
@@ -111,36 +70,51 @@ static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
- perm = vec_lvsl(0, src[j]);
- l1 = vec_ld(0, src[j]);
+ perm = vec_lvsl(x << 1, src[j]);
+ l1 = vec_ld(x << 1, src[j]);
- for (i = 0; i < (dstW - 7); i += 8) {
- int offset = i << 2;
- vector signed short l2 = vec_ld((i << 1) + 16, src[j]);
+ yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
+ yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
+ }
- vector signed int v1 = vec_ld(offset, val);
- vector signed int v2 = vec_ld(offset + 16, val);
+ vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
+ vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
+ vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
+ vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
+ vs1 = vec_packsu(vo1, vo2);
+ vs2 = vec_packsu(vo3, vo4);
+ vf = vec_packsu(vs1, vs2);
+ vec_st(vf, 0, dest);
+}
- vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]
+static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset, int x)
+{
+ int i, j;
- vector signed int i1 = vec_mule(vLumFilter, ls);
- vector signed int i2 = vec_mulo(vLumFilter, ls);
+ for (i = x; i < dstW; i++) {
+ int t = dither[(i + offset) & 7] << 12;
+ for (j = 0; j < filterSize; j++)
+ t += src[j][i] * filter[j];
+ dest[i] = av_clip_uint8(t >> 19);
+ }
+}
- vector signed int vf1 = vec_mergeh(i1, i2);
- vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]
+static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int dst_u = -(uintptr_t)dest & 15;
+ int i;
- vector signed int vo1 = vec_add(v1, vf1);
- vector signed int vo2 = vec_add(v2, vf2);
+ yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
- vec_st(vo1, offset, val);
- vec_st(vo2, offset + 16, val);
+ for (i = dst_u; i < dstW - 15; i += 16)
+ yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
+ offset, i);
- l1 = l2;
- }
- for (; i < dstW; i++)
- val[i] += src[j][i] * filter[j];
- }
- altivec_packIntArrayToCharArray(val, dest, dstW);
+ yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,