summaryrefslogtreecommitdiff
path: root/libavcodec/arm/h264dsp_neon.S
diff options
context:
space:
mode:
author    Måns Rullgård <mans@mansr.com>    2009-01-25 13:04:41 +0000
committer Måns Rullgård <mans@mansr.com>    2009-01-25 13:04:41 +0000
commit 5a29589b8101a90feabbfd5ad6ffc9c88ab1157f (patch)
tree   9e740693e5b3b4e0201b8748bdddb90b52d9521a /libavcodec/arm/h264dsp_neon.S
parent 1615fb91a110ef6cf2d8058e43b9cd5ff40cd3da (diff)
ARM: NEON optimised H.264 biweighted prediction
Originally committed as revision 16770 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/arm/h264dsp_neon.S')
-rw-r--r--  libavcodec/arm/h264dsp_neon.S  168
1 file changed, 168 insertions, 0 deletions
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index e57cb6f1e6..616a8132e5 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1368,3 +1368,171 @@ function ff_put_h264_qpel16_mc33_neon, export=1
sub r1, r1, #1
b put_h264_qpel16_mc11
.endfunc
+
+@ Biweighted prediction
+
+ .macro biweight_16 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q2, q8
+ vmov q3, q8
+1: subs ip, ip, #2
+ vld1.8 {d20-d21},[r0,:128], r2
+ \macd q2, d0, d20
+ pld [r0]
+ \macd q3, d0, d21
+ vld1.8 {d22-d23},[r1,:128], r2
+ \macs q2, d1, d22
+ pld [r1]
+ \macs q3, d1, d23
+ vmov q12, q8
+ vld1.8 {d28-d29},[r0,:128], r2
+ vmov q13, q8
+ \macd q12, d0, d28
+ pld [r0]
+ \macd q13, d0, d29
+ vld1.8 {d30-d31},[r1,:128], r2
+ \macs q12, d1, d30
+ pld [r1]
+ \macs q13, d1, d31
+ vshl.s16 q2, q2, q9
+ vshl.s16 q3, q3, q9
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vshl.s16 q12, q12, q9
+ vshl.s16 q13, q13, q9
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vmov q3, q8
+ vst1.8 {d4- d5}, [r6,:128], r2
+ vmov q2, q8
+ vst1.8 {d24-d25},[r6,:128], r2
+ bne 1b
+ pop {r4-r6, pc}
+ .endm
+
+ .macro biweight_8 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q1, q8
+ vmov q10, q8
+1: subs ip, ip, #2
+ vld1.8 {d4},[r0,:64], r2
+ \macd q1, d0, d4
+ pld [r0]
+ vld1.8 {d5},[r1,:64], r2
+ \macs q1, d1, d5
+ pld [r1]
+ vld1.8 {d6},[r0,:64], r2
+ \macd q10, d0, d6
+ pld [r0]
+ vld1.8 {d7},[r1,:64], r2
+ \macs q10, d1, d7
+ pld [r1]
+ vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.8 {d2},[r6,:64], r2
+ vmov q1, q8
+ vst1.8 {d4},[r6,:64], r2
+ bne 1b
+ pop {r4-r6, pc}
+ .endm
+
+ .macro biweight_4 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q1, q8
+ vmov q10, q8
+1: subs ip, ip, #4
+ vld1.32 {d4[0]},[r0,:32], r2
+ vld1.32 {d4[1]},[r0,:32], r2
+ \macd q1, d0, d4
+ pld [r0]
+ vld1.32 {d5[0]},[r1,:32], r2
+ vld1.32 {d5[1]},[r1,:32], r2
+ \macs q1, d1, d5
+ pld [r1]
+ blt 2f
+ vld1.32 {d6[0]},[r0,:32], r2
+ vld1.32 {d6[1]},[r0,:32], r2
+ \macd q10, d0, d6
+ pld [r0]
+ vld1.32 {d7[0]},[r1,:32], r2
+ vld1.32 {d7[1]},[r1,:32], r2
+ \macs q10, d1, d7
+ pld [r1]
+ vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.32 {d2[0]},[r6,:32], r2
+ vst1.32 {d2[1]},[r6,:32], r2
+ vmov q1, q8
+ vst1.32 {d4[0]},[r6,:32], r2
+ vst1.32 {d4[1]},[r6,:32], r2
+ bne 1b
+ pop {r4-r6, pc}
+2: vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vst1.32 {d2[0]},[r6,:32], r2
+ vst1.32 {d2[1]},[r6,:32], r2
+ pop {r4-r6, pc}
+ .endm
+
+ .macro biweight_func w
+function biweight_h264_pixels_\w\()_neon
+ push {r4-r6, lr}
+ add r4, sp, #16
+ ldm r4, {r4-r6}
+ lsr lr, r4, #31
+ add r6, r6, #1
+ eors lr, lr, r5, lsr #30
+ orr r6, r6, #1
+ vdup.16 q9, r3
+ lsl r6, r6, r3
+ vmvn q9, q9
+ vdup.16 q8, r6
+ mov r6, r0
+ beq 10f
+ subs lr, lr, #1
+ beq 20f
+ subs lr, lr, #1
+ beq 30f
+ b 40f
+10: biweight_\w vmlal.u8, vmlal.u8
+20: rsb r4, r4, #0
+ biweight_\w vmlal.u8, vmlsl.u8
+30: rsb r4, r4, #0
+ rsb r5, r5, #0
+ biweight_\w vmlsl.u8, vmlsl.u8
+40: rsb r5, r5, #0
+ biweight_\w vmlsl.u8, vmlal.u8
+ .endfunc
+ .endm
+
+ .macro biweight_entry w, h, b=1
+function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
+ mov ip, #\h
+.if \b
+ b biweight_h264_pixels_\w\()_neon
+.endif
+ .endfunc
+ .endm
+
+ biweight_entry 16, 8
+ biweight_entry 16, 16, b=0
+ biweight_func 16
+
+ biweight_entry 8, 16
+ biweight_entry 8, 4
+ biweight_entry 8, 8, b=0
+ biweight_func 8
+
+ biweight_entry 4, 8
+ biweight_entry 4, 2
+ biweight_entry 4, 4, b=0
+ biweight_func 4