diff options
author | agl@chromium.org <agl@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> | 2013-11-12 20:40:33 +0000 |
---|---|---|
committer | agl@chromium.org <agl@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> | 2013-11-12 20:40:33 +0000 |
commit | 668876756c77346a65d73e442c102d9814d24e76 (patch) | |
tree | a69804851b2dd951ab556ddd2f6ac55bc69fe9be | |
parent | 2b8f0063cec73f87917097bc4981b9d3489eb75e (diff) | |
download | openssl-668876756c77346a65d73e442c102d9814d24e76.tar.gz |
NEON fixes.
(This commit doesn't include the patch changes yet; they will follow next.)
git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/openssl@234628 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
-rw-r--r-- | openssl/crypto/chacha/chacha_enc.c | 4 |
-rw-r--r-- | openssl/crypto/chacha/chacha_vec.c | 2 |
-rw-r--r-- | openssl/crypto/chacha/chacha_vec_arm.S | 535 |
-rw-r--r-- | openssl/crypto/evp/e_chacha20poly1305.c | 10 |
-rw-r--r-- | openssl/crypto/poly1305/poly1305_arm.c | 22 |
5 files changed, 288 insertions, 285 deletions
diff --git a/openssl/crypto/chacha/chacha_enc.c b/openssl/crypto/chacha/chacha_enc.c index e4b648f..2d6b076 100644 --- a/openssl/crypto/chacha/chacha_enc.c +++ b/openssl/crypto/chacha/chacha_enc.c @@ -135,7 +135,9 @@ void CRYPTO_chacha_20(unsigned char *out, size_t todo, i; #if __arm__ - if (CRYPTO_is_NEON_capable()) + if (CRYPTO_is_NEON_capable() && + ((intptr_t)in & 15) == 0 && + ((intptr_t)out & 15) == 0) { CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); return; diff --git a/openssl/crypto/chacha/chacha_vec.c b/openssl/crypto/chacha/chacha_vec.c index 1226c39..274345d 100644 --- a/openssl/crypto/chacha/chacha_vec.c +++ b/openssl/crypto/chacha/chacha_vec.c @@ -289,7 +289,7 @@ void CRYPTO_chacha_20( op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5])); op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6])); op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7])); - op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + BPI*iters+(BPI-1))); + op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1))); op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13)); op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0])); op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1])); diff --git a/openssl/crypto/chacha/chacha_vec_arm.S b/openssl/crypto/chacha/chacha_vec_arm.S index 35dc74d..f2a6961 100644 --- a/openssl/crypto/chacha/chacha_vec_arm.S +++ b/openssl/crypto/chacha/chacha_vec_arm.S @@ -5,7 +5,7 @@ # # This file was generated by: # -# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../include -fpic +# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../include -fpic -o chacha_vec_arm.S # # And then EABI attribute 28 was set to zero to allow linking with soft-float # code. 
@@ -33,130 +33,133 @@ .thumb_func .type CRYPTO_chacha_20_neon, %function CRYPTO_chacha_20_neon: - @ args = 8, pretend = 0, frame = 296 + @ args = 8, pretend = 0, frame = 304 @ frame_needed = 1, uses_anonymous_args = 0 @ link register save eliminated. push {r4, r5, r6, r7, r8, r9, sl, fp} fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} - sub sp, sp, #296 + sub sp, sp, #304 add r7, sp, #0 movw ip, #43691 movt ip, 43690 - str r2, [r7, #192] + str r2, [r7, #196] sub sp, sp, #96 - ldr r4, [r7, #192] - ldr r6, [r7, #392] + ldr r4, [r7, #196] + ldr r6, [r7, #400] ldr r2, .L38+16 umull r4, ip, ip, r4 ldr r6, [r6, #0] - ldr r8, [r7, #392] + ldr r8, [r7, #400] .LPIC24: add r2, pc add r4, sp, #15 - str r3, [r7, #236] - str r6, [r7, #172] + str r3, [r7, #244] + str r6, [r7, #176] bic r4, r4, #15 - str r0, [r7, #184] - str r4, [r7, #196] + str r0, [r7, #188] + str r4, [r7, #200] lsrs ip, ip, #7 - str r1, [r7, #180] + str r1, [r7, #184] ldmia r2, {r0, r1, r2, r3} ldr r4, [r8, #4] - ldr r5, [r7, #236] + ldr r5, [r7, #244] vld1.64 {d24-d25}, [r5:64] vldr d26, [r5, #16] vldr d27, [r5, #24] - ldr fp, [r7, #196] - ldr r8, [r7, #396] - ldr r5, [r7, #172] - add r6, fp, #64 - str r4, [r7, #292] + ldr r9, [r7, #200] + ldr r8, [r7, #404] + ldr r5, [r7, #176] + add r6, r9, #64 + str r4, [r7, #300] mov r4, #0 - str r8, [r7, #280] - str r5, [r7, #288] - str r4, [r7, #284] + str r8, [r7, #288] + str r5, [r7, #296] + str r4, [r7, #292] stmia r6, {r0, r1, r2, r3} - vldr d22, [fp, #64] - vldr d23, [fp, #72] - vldr d20, [r7, #280] - vldr d21, [r7, #288] - str ip, [r7, #188] + vldr d22, [r9, #64] + vldr d23, [r9, #72] + vldr d20, [r7, #288] + vldr d21, [r7, #296] + str ip, [r7, #192] beq .L20 lsl r6, ip, #1 - ldr r1, [fp, #68] + ldr r1, [r9, #68] add r3, r6, ip - str r6, [r7, #176] - ldr r2, [fp, #72] + str r6, [r7, #180] + ldr r2, [r9, #72] add r8, r8, #2 - ldr r5, [fp, #76] + ldr r5, [r9, #76] vldr d18, .L38 vldr d19, .L38+8 - str r4, [r7, #232] - ldr r6, [r7, #180] - ldr r4, [r7, #184] - 
str r0, [r7, #220] - str r1, [r7, #216] - str r8, [r7, #200] - str r2, [r7, #212] - str r3, [r7, #204] - str r5, [r7, #208] - str r6, [r7, #244] str r4, [r7, #240] + ldr r6, [r7, #184] + ldr r4, [r7, #188] + str r0, [r7, #224] + str r1, [r7, #220] + str r8, [r7, #208] + str r2, [r7, #216] + str r3, [r7, #204] + str r5, [r7, #212] + str r6, [r7, #252] + str r4, [r7, #248] .L4: - ldr r6, [r7, #236] + ldr r2, [r7, #244] + add r9, r7, #216 + ldr r3, [r7, #244] vadd.i32 q8, q10, q9 - ldr r5, [r7, #236] + ldr r6, [r7, #208] vmov q15, q13 @ v4si - ldr r8, [r7, #232] + ldr r5, [r7, #240] vmov q3, q12 @ v4si - ldr r6, [r6, #4] + ldr r4, [r7, #244] vmov q2, q11 @ v4si - ldr fp, [r7, #200] + adds r5, r5, r6 + ldr r2, [r2, #8] + ldr r6, [r7, #400] vmov q5, q10 @ v4si - ldr r4, [r7, #236] + ldr r3, [r3, #12] vmov q1, q13 @ v4si - add ip, r8, fp - ldr r5, [r5, #0] - ldr r0, [r7, #236] - add r8, r7, #208 - ldr r1, [r7, #236] + ldr r0, [r7, #244] vmov q0, q12 @ v4si - str r6, [r7, #260] + ldr r1, [r7, #244] vmov q4, q11 @ v4si - ldr r6, [r7, #392] - ldmia r8, {r8, r9, sl, fp} - ldr r0, [r0, #8] - ldr r1, [r1, #12] - str r5, [r7, #224] + ldmia r9, {r9, sl, fp} + str r5, [r7, #228] ldr r5, [r4, #24] - ldr r3, [r4, #28] + ldr r0, [r0, #0] + ldr r1, [r1, #4] + str r2, [r7, #264] + str r3, [r7, #236] ldr r2, [r6, #4] - str r0, [r7, #256] - str r1, [r7, #228] - str r5, [r7, #272] + ldr r3, [r4, #28] + str r5, [r7, #280] ldr r5, [r6, #0] movs r6, #0 + ldr ip, [r7, #228] + ldr r8, [r7, #212] + str r0, [r7, #232] + str r1, [r7, #268] ldr r0, [r4, #16] ldr r1, [r4, #20] movs r4, #10 - str r2, [r7, #20] - str r3, [r7, #276] - str r9, [r7, #268] + str r2, [r7, #24] + str r3, [r7, #284] + str r4, [r7, #256] + ldr r2, [r7, #264] + str r9, [r7, #276] mov r9, r6 - str r4, [r7, #248] - ldr r2, [r7, #256] - ldr r3, [r7, #228] - str r8, [r7, #252] + ldr r6, [r7, #280] + str r8, [r7, #260] mov r8, sl - ldr r6, [r7, #272] + str r1, [r7, #272] mov sl, ip - str r1, [r7, #264] - ldr ip, [r7, #20] - str 
r6, [r7, #256] + str r6, [r7, #264] mov r6, r5 - ldr r1, [r7, #260] + ldr r3, [r7, #236] mov r5, r0 - ldr r0, [r7, #224] + ldr ip, [r7, #24] + ldr r1, [r7, #268] + ldr r0, [r7, #232] b .L39 .L40: .align 3 @@ -171,445 +174,445 @@ CRYPTO_chacha_20_neon: vadd.i32 q4, q4, q0 add r8, r8, r1 vadd.i32 q2, q2, q3 - str r8, [r7, #260] + str r8, [r7, #268] veor q5, q5, q4 - ldr r8, [r7, #268] + ldr r8, [r7, #276] veor q8, q8, q2 add fp, fp, r0 - str fp, [r7, #272] + str fp, [r7, #280] add r8, r8, r2 vrev32.16 q5, q5 - str r8, [r7, #268] + str r8, [r7, #276] vrev32.16 q8, q8 vadd.i32 q1, q1, q5 vadd.i32 q15, q15, q8 - ldr r8, [r7, #272] + ldr r8, [r7, #280] veor q0, q1, q0 - ldr r4, [r7, #252] + ldr r4, [r7, #260] veor q3, q15, q3 eor sl, sl, r8 - ldr r8, [r7, #268] + ldr r8, [r7, #276] add fp, r4, r3 vshl.i32 q7, q0, #12 - ldr r4, [r7, #260] + ldr r4, [r7, #268] vshl.i32 q6, q3, #12 eor r6, r6, r8 eor r9, r9, r4 - ldr r4, [r7, #264] + ldr r4, [r7, #272] vsri.32 q7, q0, #20 ror r8, r6, #16 - ldr r6, [r7, #256] + ldr r6, [r7, #264] eor ip, ip, fp vsri.32 q6, q3, #20 ror sl, sl, #16 ror r9, r9, #16 add r5, r5, sl vadd.i32 q4, q4, q7 - str r5, [r7, #228] + str r5, [r7, #236] vadd.i32 q2, q2, q6 add r5, r4, r9 add r4, r6, r8 - ldr r6, [r7, #276] + ldr r6, [r7, #284] ror ip, ip, #16 veor q5, q4, q5 veor q8, q2, q8 add r6, r6, ip - str r6, [r7, #256] + str r6, [r7, #264] eors r1, r1, r5 - ldr r6, [r7, #228] + ldr r6, [r7, #236] vshl.i32 q3, q5, #8 vshl.i32 q14, q8, #8 eors r2, r2, r4 eors r0, r0, r6 - ldr r6, [r7, #256] + ldr r6, [r7, #264] vsri.32 q3, q5, #24 ror r1, r1, #20 eors r3, r3, r6 - ldr r6, [r7, #272] + ldr r6, [r7, #280] ror r0, r0, #20 vsri.32 q14, q8, #24 adds r6, r0, r6 - str r6, [r7, #276] - ldr r6, [r7, #260] + str r6, [r7, #284] + ldr r6, [r7, #268] vadd.i32 q1, q1, q3 vadd.i32 q15, q15, q14 ror r2, r2, #20 adds r6, r1, r6 - str r6, [r7, #252] - ldr r6, [r7, #268] + str r6, [r7, #260] + ldr r6, [r7, #276] veor q6, q15, q6 veor q7, q1, q7 ror r3, r3, #20 adds r6, 
r2, r6 - str r6, [r7, #272] - ldr r6, [r7, #276] + str r6, [r7, #280] + ldr r6, [r7, #284] vshl.i32 q0, q6, #7 vshl.i32 q5, q7, #7 add fp, r3, fp eor sl, r6, sl - ldr r6, [r7, #252] + ldr r6, [r7, #260] eor ip, fp, ip vsri.32 q0, q6, #25 eor r9, r6, r9 - ldr r6, [r7, #272] + ldr r6, [r7, #280] ror sl, sl, #24 vsri.32 q5, q7, #25 eor r8, r6, r8 - ldr r6, [r7, #228] + ldr r6, [r7, #236] ror r9, r9, #24 ror ip, ip, #24 add r6, sl, r6 - str r6, [r7, #268] - ldr r6, [r7, #256] + str r6, [r7, #276] + ldr r6, [r7, #264] add r5, r9, r5 - str r5, [r7, #264] + str r5, [r7, #272] vext.32 q5, q5, q5, #1 add r5, ip, r6 - ldr r6, [r7, #268] + ldr r6, [r7, #276] vext.32 q0, q0, q0, #1 vadd.i32 q4, q4, q5 eors r0, r0, r6 - ldr r6, [r7, #264] + ldr r6, [r7, #272] vadd.i32 q2, q2, q0 vext.32 q3, q3, q3, #3 ror r8, r8, #24 eors r1, r1, r6 vext.32 q14, q14, q14, #3 add r4, r8, r4 - ldr r6, [r7, #276] + ldr r6, [r7, #284] veor q3, q4, q3 veor q14, q2, q14 eors r2, r2, r4 ror r1, r1, #25 vext.32 q1, q1, q1, #2 adds r6, r1, r6 - str r6, [r7, #276] + str r6, [r7, #284] vext.32 q15, q15, q15, #2 - ldr r6, [r7, #252] + ldr r6, [r7, #260] eors r3, r3, r5 ror r2, r2, #25 vrev32.16 q8, q14 adds r6, r2, r6 vrev32.16 q3, q3 - str r6, [r7, #260] + str r6, [r7, #268] vadd.i32 q1, q1, q3 - ldr r6, [r7, #272] + ldr r6, [r7, #280] vadd.i32 q15, q15, q8 ror r3, r3, #25 veor q5, q1, q5 adds r6, r3, r6 veor q0, q15, q0 - str r6, [r7, #256] - ldr r6, [r7, #260] + str r6, [r7, #264] + ldr r6, [r7, #268] ror r0, r0, #25 add fp, r0, fp vshl.i32 q6, q5, #12 eor sl, r6, sl - ldr r6, [r7, #276] + ldr r6, [r7, #284] vshl.i32 q14, q0, #12 eor r8, fp, r8 eor ip, r6, ip - ldr r6, [r7, #256] + ldr r6, [r7, #264] vsri.32 q6, q5, #20 ror sl, sl, #16 eor r9, r6, r9 ror r6, r8, #16 vsri.32 q14, q0, #20 - ldr r8, [r7, #264] + ldr r8, [r7, #272] ror ip, ip, #16 add r5, sl, r5 add r8, r6, r8 add r4, ip, r4 - str r4, [r7, #228] + str r4, [r7, #236] eor r0, r8, r0 - str r5, [r7, #272] + str r5, [r7, #280] vadd.i32 q4, q4, 
q6 - ldr r5, [r7, #228] + ldr r5, [r7, #236] vadd.i32 q2, q2, q14 - ldr r4, [r7, #268] + ldr r4, [r7, #276] ror r0, r0, #20 veor q3, q4, q3 eors r1, r1, r5 veor q0, q2, q8 - str r8, [r7, #264] - str r0, [r7, #20] + str r8, [r7, #272] + str r0, [r7, #24] add fp, r0, fp - ldr r8, [r7, #272] + ldr r8, [r7, #280] ror r9, r9, #16 - ldr r0, [r7, #276] + ldr r0, [r7, #284] add r4, r9, r4 - str fp, [r7, #252] + str fp, [r7, #260] ror r1, r1, #20 add fp, r1, r0 eor r2, r8, r2 - ldr r0, [r7, #252] + ldr r0, [r7, #260] eors r3, r3, r4 vshl.i32 q5, q3, #8 - str r4, [r7, #224] + str r4, [r7, #232] vshl.i32 q8, q0, #8 - ldr r4, [r7, #260] - ldr r5, [r7, #256] + ldr r4, [r7, #268] + ldr r5, [r7, #264] ror r2, r2, #20 ror r3, r3, #20 eors r6, r6, r0 adds r5, r3, r5 add r8, r2, r4 vsri.32 q5, q3, #24 - ldr r4, [r7, #264] + ldr r4, [r7, #272] eor r9, r5, r9 eor ip, fp, ip vsri.32 q8, q0, #24 eor sl, r8, sl ror r6, r6, #24 - ldr r0, [r7, #272] - str r5, [r7, #268] + ldr r0, [r7, #280] + str r5, [r7, #276] adds r4, r6, r4 - ldr r5, [r7, #228] + ldr r5, [r7, #236] vadd.i32 q1, q1, q5 - str r4, [r7, #264] + str r4, [r7, #272] vadd.i32 q15, q15, q8 - ldr r4, [r7, #224] + ldr r4, [r7, #232] ror ip, ip, #24 ror sl, sl, #24 ror r9, r9, #24 add r5, ip, r5 add r0, sl, r0 - str r5, [r7, #256] + str r5, [r7, #264] add r5, r9, r4 - str r0, [r7, #276] + str r0, [r7, #284] veor q6, q1, q6 - ldr r4, [r7, #20] + ldr r4, [r7, #24] veor q14, q15, q14 - ldr r0, [r7, #264] + ldr r0, [r7, #272] eors r3, r3, r5 vshl.i32 q0, q6, #7 vext.32 q1, q1, q1, #2 eors r0, r0, r4 - ldr r4, [r7, #276] - str r0, [r7, #272] + ldr r4, [r7, #284] + str r0, [r7, #280] vshl.i32 q3, q14, #7 eors r2, r2, r4 - ldr r4, [r7, #272] - ldr r0, [r7, #256] + ldr r4, [r7, #280] + ldr r0, [r7, #264] vsri.32 q0, q6, #25 ror r2, r2, #25 ror r3, r3, #25 eors r1, r1, r0 vsri.32 q3, q14, #25 ror r0, r4, #25 - ldr r4, [r7, #248] + ldr r4, [r7, #256] ror r1, r1, #25 vext.32 q5, q5, q5, #1 subs r4, r4, #1 - str r4, [r7, #248] + str r4, [r7, 
#256] vext.32 q15, q15, q15, #2 vext.32 q8, q8, q8, #1 vext.32 q0, q0, q0, #3 vext.32 q3, q3, q3, #3 bne .L3 - ldr r4, [r7, #256] - vadd.i32 q4, q11, q4 - str r2, [r7, #256] + ldr r4, [r7, #264] vadd.i32 q14, q10, q9 - ldr r2, [r7, #244] - vld1.64 {d12-d13}, [r2:64] - str r4, [r7, #272] - veor q4, q4, q6 - ldr r4, [r7, #220] + str r2, [r7, #264] vadd.i32 q10, q10, q5 - ldr r2, [r7, #216] - vadd.i32 q0, q12, q0 - add fp, fp, r4 - str ip, [r7, #20] - ldr r4, [r7, #212] + ldr r2, [r7, #252] + vld1.64 {d12-d13}, [r2:64] + ldr r2, [r7, #220] + vadd.i32 q4, q11, q4 + str ip, [r7, #24] mov ip, sl - str r0, [r7, #224] mov sl, r8 + ldr r8, [r7, #260] + add sl, sl, r2 + ldr r2, [r7, #212] + str r4, [r7, #280] + vadd.i32 q0, q12, q0 + ldr r4, [r7, #224] + add r8, r8, r2 + ldr r2, [r7, #240] + vadd.i32 q1, q13, q1 + str r0, [r7, #232] + add fp, fp, r4 mov r0, r5 - ldr r8, [r7, #252] + ldr r4, [r7, #216] mov r5, r6 - add sl, sl, r2 mov r6, r9 - ldr r2, [r7, #208] - ldr r9, [r7, #268] - vadd.i32 q1, q13, q1 + ldr r9, [r7, #276] + adds r2, r2, #3 + str r2, [r7, #240] vadd.i32 q2, q11, q2 - str r1, [r7, #260] + ldr r2, [r7, #252] add r9, r9, r4 - add r4, r8, r2 - ldr r8, [r7, #232] vadd.i32 q3, q12, q3 + ldr r4, [r7, #228] vadd.i32 q15, q13, q15 - str r3, [r7, #228] - add r2, r8, #2 + str r1, [r7, #268] vadd.i32 q8, q14, q8 - add ip, r2, ip - ldr r2, [r7, #240] - vst1.64 {d8-d9}, [r2:64] - ldr r2, [r7, #244] - ldr r3, [r7, #276] + str r3, [r7, #236] + veor q4, q4, q6 + ldr r3, [r7, #284] + ldr r1, [r7, #272] + add ip, r4, ip + ldr r4, [r7, #248] + vst1.64 {d8-d9}, [r4:64] vldr d8, [r2, #16] vldr d9, [r2, #24] - ldr r1, [r7, #264] veor q0, q0, q4 - add r8, r8, #3 - str r8, [r7, #232] - ldr r8, [r7, #240] - vstr d0, [r8, #16] - vstr d1, [r8, #24] + vstr d0, [r4, #16] + vstr d1, [r4, #24] vldr d0, [r2, #32] vldr d1, [r2, #40] veor q1, q1, q0 - vstr d2, [r8, #32] - vstr d3, [r8, #40] + vstr d2, [r4, #32] + vstr d3, [r4, #40] vldr d2, [r2, #48] vldr d3, [r2, #56] veor q10, q10, q1 - 
vstr d20, [r8, #48] - vstr d21, [r8, #56] + vstr d20, [r4, #48] + vstr d21, [r4, #56] vldr d8, [r2, #64] vldr d9, [r2, #72] veor q2, q2, q4 - vstr d4, [r8, #64] - vstr d5, [r8, #72] + vstr d4, [r4, #64] + vstr d5, [r4, #72] vldr d10, [r2, #80] vldr d11, [r2, #88] veor q3, q3, q5 - vstr d6, [r8, #80] - vstr d7, [r8, #88] + vstr d6, [r4, #80] + vstr d7, [r4, #88] vldr d12, [r2, #96] vldr d13, [r2, #104] veor q15, q15, q6 - vstr d30, [r8, #96] - vstr d31, [r8, #104] + vstr d30, [r4, #96] + vstr d31, [r4, #104] vldr d20, [r2, #112] vldr d21, [r2, #120] veor q8, q8, q10 - vstr d16, [r8, #112] - vstr d17, [r8, #120] - mov r8, r2 - ldr r2, [r2, #128] + vstr d16, [r4, #112] + vstr d17, [r4, #120] + ldr r4, [r2, #128] + ldr r2, [r7, #248] vadd.i32 q10, q14, q9 - eor r2, fp, r2 - ldr fp, [r7, #240] + eor r4, fp, r4 vadd.i32 q10, q10, q9 - str r2, [fp, #128] - ldr r2, [r8, #132] + str r4, [r2, #128] + ldr r4, [r7, #252] + ldr r2, [r4, #132] eor r2, sl, r2 - str r2, [fp, #132] - ldr r2, [r8, #136] + ldr sl, [r7, #248] + str r2, [sl, #132] + ldr r2, [r4, #136] eor r2, r9, r2 - str r2, [fp, #136] - ldr r2, [r8, #140] - eors r2, r2, r4 - str r2, [fp, #140] - ldr r2, [r7, #236] - ldr r4, [r8, #144] + str r2, [sl, #136] + ldr r2, [r4, #140] + eor r2, r8, r2 + str r2, [sl, #140] + ldr r2, [r7, #244] + ldr r4, [r4, #144] ldr r2, [r2, #0] - str r4, [r7, #168] - ldr r4, [r7, #224] + str r4, [r7, #44] + ldr r4, [r7, #232] add r8, r4, r2 - ldr r2, [r7, #168] - ldr r4, [r7, #236] + ldr r2, [r7, #44] + ldr r4, [r7, #244] eor r8, r8, r2 - ldr r2, [r7, #244] - str r8, [fp, #144] + ldr r2, [r7, #252] + str r8, [sl, #144] ldr r4, [r4, #4] ldr r2, [r2, #148] - str r2, [r7, #36] - ldr r2, [r7, #260] + str r2, [r7, #40] + ldr r2, [r7, #268] add r8, r2, r4 - ldr r4, [r7, #36] - ldr r2, [r7, #236] + ldr r4, [r7, #40] + ldr r2, [r7, #244] eor r8, r8, r4 - ldr r4, [r7, #244] - str r8, [fp, #148] + ldr r4, [r7, #252] + str r8, [sl, #148] ldr r2, [r2, #8] ldr r4, [r4, #152] - str r4, [r7, #32] - ldr 
r4, [r7, #256] + str r4, [r7, #36] + ldr r4, [r7, #264] add r8, r4, r2 - ldr r2, [r7, #32] + ldr r2, [r7, #36] eor r8, r8, r2 - str r8, [fp, #152] - ldr r2, [r7, #244] - ldr r4, [r7, #236] + str r8, [sl, #152] + ldr r2, [r7, #252] + ldr r4, [r7, #244] ldr r2, [r2, #156] ldr r4, [r4, #12] - str r2, [r7, #28] - ldr r2, [r7, #228] + str r2, [r7, #32] + ldr r2, [r7, #236] add r8, r2, r4 - ldr r4, [r7, #28] - ldr r2, [r7, #244] + ldr r4, [r7, #32] + ldr r2, [r7, #252] eor r8, r8, r4 - str r8, [fp, #156] - ldr r8, [r7, #236] + str r8, [sl, #156] + ldr r8, [r7, #244] ldr r2, [r2, #160] ldr r4, [r8, #16] adds r0, r0, r4 - ldr r4, [r7, #244] + ldr r4, [r7, #252] eors r0, r0, r2 - str r0, [fp, #160] + str r0, [sl, #160] ldr r0, [r8, #20] ldr r2, [r4, #164] adds r1, r1, r0 - ldr r0, [r7, #272] + ldr r0, [r7, #280] eors r1, r1, r2 - str r1, [fp, #164] + str r1, [sl, #164] ldr r2, [r8, #24] ldr r1, [r4, #168] adds r2, r0, r2 eors r2, r2, r1 - str r2, [fp, #168] + str r2, [sl, #168] ldr r1, [r8, #28] ldr r2, [r4, #172] adds r3, r3, r1 eors r3, r3, r2 - str r3, [fp, #172] + str r3, [sl, #172] ldr r3, [r4, #176] eor r3, ip, r3 - str r3, [fp, #176] + str r3, [sl, #176] ldr r3, [r4, #180] - ldr r4, [r7, #392] + ldr r4, [r7, #400] eors r6, r6, r3 - str r6, [fp, #180] - ldr r6, [r7, #244] + str r6, [sl, #180] + ldr r6, [r7, #252] ldr r2, [r4, #0] ldr r3, [r6, #184] adds r5, r5, r2 eors r5, r5, r3 - str r5, [fp, #184] + str r5, [sl, #184] ldr r2, [r6, #188] adds r6, r6, #192 ldr r3, [r4, #4] - str r6, [r7, #244] - ldr r0, [r7, #20] - ldr r1, [r7, #232] + str r6, [r7, #252] + ldr r0, [r7, #24] + ldr r1, [r7, #240] adds r4, r0, r3 eors r4, r4, r2 ldr r2, [r7, #204] - str r4, [fp, #188] - add fp, fp, #192 + str r4, [sl, #188] + add sl, sl, #192 cmp r1, r2 - str fp, [r7, #240] + str sl, [r7, #248] bne .L4 - ldr r4, [r7, #188] - ldr r3, [r7, #176] - ldr r6, [r7, #184] + ldr r4, [r7, #192] + ldr r3, [r7, #180] + ldr r6, [r7, #188] adds r5, r3, r4 - ldr r8, [r7, #180] + ldr r8, [r7, #184] 
lsls r5, r5, #6 adds r4, r6, r5 add r5, r8, r5 .L2: - ldr fp, [r7, #192] + ldr r9, [r7, #196] movw r3, #43691 movt r3, 43690 - ldr r6, [r7, #192] - umull fp, r3, r3, fp + ldr sl, [r7, #196] + umull r9, r3, r3, r9 lsrs r3, r3, #7 add r3, r3, r3, lsl #1 - sub r3, r6, r3, lsl #6 + sub r3, sl, r3, lsl #6 lsrs r6, r3, #6 beq .L5 add r1, r5, #16 @@ -693,7 +696,7 @@ CRYPTO_chacha_20_neon: adds r4, r4, r6 adds r5, r5, r6 .L5: - ldr r6, [r7, #192] + ldr r6, [r7, #196] ands ip, r6, #63 beq .L1 vmov q8, q10 @ v4si @@ -743,11 +746,11 @@ CRYPTO_chacha_20_neon: cmp ip, #15 vadd.i32 q11, q11, q15 bhi .L37 - ldr fp, [r7, #196] - vst1.64 {d22-d23}, [fp:128] + ldr r9, [r7, #200] + vst1.64 {d22-d23}, [r9:128] .L14: - ldr r6, [r7, #192] - and r3, r6, #48 + ldr sl, [r7, #196] + and r3, sl, #48 cmp ip, r3 bls .L1 adds r0, r5, r3 @@ -772,7 +775,7 @@ CRYPTO_chacha_20_neon: orreq r2, r2, #1 lsl sl, r8, #4 cbnz r2, .L35 - ldr fp, [r7, #196] + ldr fp, [r7, #200] add r6, fp, r3 .L17: vld1.8 {q8}, [r0]! @@ -786,7 +789,7 @@ CRYPTO_chacha_20_neon: add r3, r3, sl beq .L1 .L35: - ldr r0, [r7, #196] + ldr r0, [r7, #200] .L25: ldrb r2, [r5, r3] @ zero_extendqisi2 ldrb r1, [r3, r0] @ zero_extendqisi2 @@ -796,7 +799,7 @@ CRYPTO_chacha_20_neon: cmp ip, r3 bhi .L25 .L1: - add r7, r7, #296 + add r7, r7, #304 mov sp, r7 fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} pop {r4, r5, r6, r7, r8, r9, sl, fp} @@ -819,7 +822,7 @@ CRYPTO_chacha_20_neon: vadd.i32 q8, q8, q10 vldr d0, [r5, #32] vldr d1, [r5, #40] - ldr r6, [r7, #196] + ldr r6, [r7, #200] vstr d16, [r6, #48] vstr d17, [r6, #56] veor q8, q13, q0 @@ -827,16 +830,16 @@ CRYPTO_chacha_20_neon: vstr d17, [r4, #40] b .L14 .L12: - ldr r8, [r7, #196] + ldr r8, [r7, #200] vstr d18, [r8, #16] vstr d19, [r8, #24] b .L14 .L20: - ldr r5, [r7, #180] - ldr r4, [r7, #184] + ldr r5, [r7, #184] + ldr r4, [r7, #188] b .L2 .L13: - ldr r6, [r7, #196] + ldr r6, [r7, #200] vstr d26, [r6, #32] vstr d27, [r6, #40] b .L14 diff --git 
a/openssl/crypto/evp/e_chacha20poly1305.c b/openssl/crypto/evp/e_chacha20poly1305.c index 1c0c0fb..c556ddc 100644 --- a/openssl/crypto/evp/e_chacha20poly1305.c +++ b/openssl/crypto/evp/e_chacha20poly1305.c @@ -120,6 +120,12 @@ static void poly1305_update_with_length(poly1305_state *poly1305, CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes)); } +#if __arm__ +#define ALIGNED __attribute__((aligned(16))) +#else +#define ALIGNED +#endif + static ssize_t aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, unsigned char *out, size_t max_out_len, const unsigned char *nonce, size_t nonce_len, @@ -127,7 +133,7 @@ static ssize_t aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, const unsigned char *ad, size_t ad_len) { const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state; - unsigned char poly1305_key[32]; + unsigned char poly1305_key[32] ALIGNED; poly1305_state poly1305; const uint64_t in_len_64 = in_len; @@ -184,7 +190,7 @@ static ssize_t aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, { const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state; unsigned char mac[POLY1305_TAG_LEN]; - unsigned char poly1305_key[32]; + unsigned char poly1305_key[32] ALIGNED; size_t out_len; poly1305_state poly1305; const uint64_t in_len_64 = in_len; diff --git a/openssl/crypto/poly1305/poly1305_arm.c b/openssl/crypto/poly1305/poly1305_arm.c index 34e339d..c7d2991 100644 --- a/openssl/crypto/poly1305/poly1305_arm.c +++ b/openssl/crypto/poly1305/poly1305_arm.c @@ -238,8 +238,6 @@ void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in, fe1305x2 *const c = h + 1; fe1305x2 *const precomp = c + 1; unsigned int i; - unsigned char data[sizeof(fe1305x2) + 16]; - fe1305x2 *const r2r = (fe1305x2 *) (data + (15 & (-(int) data))); if (st->buf_used) { @@ -252,20 +250,12 @@ void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in, in_len -= todo; in += todo; - if (st->buf_used == sizeof(st->buf)) + if 
(st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h,h,precomp,&zero); fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); - r2r->v[0] = precomp->v[0]; - r2r->v[2] = precomp->v[2]; - r2r->v[4] = precomp->v[4]; - r2r->v[6] = precomp->v[6]; - r2r->v[8] = precomp->v[8]; - r2r->v[1] = r->v[1]; - r2r->v[3] = r->v[3]; - r2r->v[5] = r->v[5]; - r2r->v[7] = r->v[7]; - r2r->v[9] = r->v[9]; - addmulmod(h,h,r2r,c); + for (i = 0; i < 10; i++) + h->v[i] += c->v[i]; st->buf_used = 0; } } @@ -273,7 +263,7 @@ void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in, while (in_len > 32) { unsigned int tlen = 1048576; - if (in_len < 1048576) + if (in_len < tlen) tlen = in_len; tlen -= blocks(h, precomp, in, tlen); in_len -= tlen; @@ -296,6 +286,8 @@ void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]) fe1305x2 *const c = h + 1; fe1305x2 *const precomp = c + 1; + addmulmod(h,h,precomp,&zero); + if (st->buf_used > 16) { fe1305x2_frombytearray(c, st->buf, st->buf_used); |