diff options
author | Jason Sams <jsams@google.com> | 2013-02-05 13:08:12 -0800 |
---|---|---|
committer | Android Git Automerger <android-git-automerger@android.com> | 2013-02-05 13:08:12 -0800 |
commit | 1f80b2c8b2b52c836c4039f716f702f96f213b0d (patch) | |
tree | 7224500f0a1b5bb6bdffa88d3832a4acf725f740 | |
parent | b53dc856325cfb12401abef33cc7b3b439c3152b (diff) | |
parent | 902868b7c94f0c16b53e28ee1dd68c4e4a24f964 (diff) | |
download | rs-tools_r22.tar.gz |
am 902868b7: Merge "YUV(NV21) to RGBA function NEON optimizations." (tags: tools_r22, jb-mr1.1-dev-plus-aosp)
# By Vassilis Laganakos
# Via David Butcher (1) and Gerrit Code Review (1)
* commit '902868b7c94f0c16b53e28ee1dd68c4e4a24f964':
YUV(NV21) to RGBA function NEON optimizations.
-rw-r--r-- | driver/rsdIntrinsics_Convolve.S | 151 |
1 file changed, 80 insertions, 71 deletions
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S index 04dd8b1c..4e4b3992 100644 --- a/driver/rsdIntrinsics_Convolve.S +++ b/driver/rsdIntrinsics_Convolve.S @@ -379,85 +379,94 @@ ENTRY(rsdIntrinsicBlurHF_K) END(rsdIntrinsicBlurHF_K) /* + Function called with the following arguments: dst, Y, vu, len, YuvCoeff r0 = dst r1 = Y r2 = VU r3 = length (pixels / 8) - r4 = sp, params + ---- Args below will be in the stack ---- + sp = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuv_K) - push {r4-r8, r10, r11, lr} - vpush {q4-q7} - - ldr r4, [sp, #32+64] - vld1.16 {q2}, [r4]! // mults - vld1.16 {q3}, [r4]! // y offset - vld1.16 {q4}, [r4]! // 128 - vdup.8 d3, d5[1] - -1: - vld1.8 {d10}, [r1]! - vld1.8 {d12}, [r2]! - vmovl.u8 q5, d10 // Y at .16 - vmovl.u8 q6, d12 // vu at .16 - - vsub.i16 q5, q5, q3 - vsub.i16 q6, q6, q4 - vtrn.16 d12, d13 // d12 = u, d13 = v - vmov q7, q6 - vtrn.16 d12, d14 - vtrn.32 d12, d14 - vtrn.16 d13, d15 - vtrn.32 d13, d15 - - vmull.s16 q8, d10, d4[0] - vmull.s16 q11, d11, d4[0] - vmov q9, q8 - vmov q10, q8 - vmov q12, q11 - vmov q13, q11 - - vmlal.s16 q8, d12, d4[1] - vmlal.s16 q9, d12, d5[0] - vmlal.s16 q10, d13, d4[3] - vmlal.s16 q9, d13, d4[2] - - vmlal.s16 q11, d14, d4[1] - vmlal.s16 q12, d14, d5[0] - vmlal.s16 q13, d15, d4[3] - vmlal.s16 q12, d15, d4[2] - - - vshrn.i32 d16, q8, #8 - vshrn.i32 d18, q9, #8 - vshrn.i32 d20, q10, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! - - vshrn.i32 d16, q11, #8 - vshrn.i32 d18, q12, #8 - vshrn.i32 d20, q13, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! 
- - subs r3, r3, #1 - bne 1b - - vpop {q4-q7} - pop {r4-r8, r10, r11, lr} - bx lr + push {r4, r5, lr} @ preserve clobbered int registers + vpush {Q4-Q7} @ preserve Vregisters we clobber + + mov r5, #16 @ Integer 16 in r5; used as an incrementing value + + ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3) + vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 + vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 + vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 + + mov r4, #8 @ Integer 8 in r4; used as an incrementing value + + vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in + @ the coeffs matrix (Q2) + + 1: + vld1.8 {d10}, [r1]! @ get Y (r1->Y) + vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4) + pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops + pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops + + vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) + vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) + vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) + + vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) + vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) + vmov.u16 d11, d10 @ Copying V to d11 + vmov.u16 d13, d12 @ Copying U to d13 + vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) + vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) + + + vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 + vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 + vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 + vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 + + @ R G B + @ Pixel(0-3) Q8, Q9, Q10 + @ Pixel(4-7) Q11, Q12, Q13 + @ + + @ Pixel(0-3) + vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 + vmlal.s16 Q9, d10, d5[0] @ G : 
Q9 = Q9(Y-16) + (V-128) * (-208) + vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 + + @ Pixel(4-7) + vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 + vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) + vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 + + @ Pixel(0-3) + vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit + + @ Pixel(4-7) + vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit + + vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) + + subs r3, r3, #1 @ Checking length (r3) + vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) + + bne 1b @ if not done with length, loop + + vpop {Q4-Q7} @ Restore Vregisters + pop {r4, r5, lr} @ Restore int registers + bx lr END(rsdIntrinsicYuv_K) /* Convolve 5x5 */ |