author    | Treehugger Robot <treehugger-gerrit@google.com> | 2019-09-18 11:57:37 +0000
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2019-09-18 11:57:37 +0000
commit    | 6a751e14fcef594edf6a0eb7fab98799deaa484e (patch)
tree      | d58d9102b2010c48c3d95474fb5914ae4c2371d9
parent    | 055b2f3c3849d992f9287a2fa0dbe79bd9f2f251 (diff)
parent    | 68a0658ee72ca4c3961719900e9f97562dd66931 (diff)
download  | arm-optimized-routines-ndk-sysroot-r21.tar.gz
Merge "Upgrade arm-optimized-routines to 9c8399909a9835e6f55977df1661cf6306c56707"ndk-sysroot-r21
66 files changed, 5916 insertions, 7 deletions
@@ -9,11 +9,11 @@ third_party {
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "6b594432c8ac46e71686ea21fad30d1c3f79e65a"
+  version: "9c8399909a9835e6f55977df1661cf6306c56707"
   license_type: NOTICE
   last_upgrade_date {
     year: 2019
-    month: 7
-    day: 31
+    month: 9
+    day: 3
   }
 }
diff --git a/math/exp.c b/math/exp.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT
  */

+#include <float.h>
 #include <math.h>
 #include <stdint.h>
 #include "math_config.h"
@@ -169,4 +170,7 @@ __exp_dd (double x, double xtail)
 strong_alias (exp, __exp_finite)
 hidden_alias (exp, __ieee754_exp)
 hidden_alias (__exp_dd, __exp1)
+# if LDBL_MANT_DIG == 53
+long double expl (long double x) { return exp (x); }
+# endif
 #endif
diff --git a/math/exp2.c b/math/exp2.c
index fbedbcb..47aa479 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT
  */

+#include <float.h>
 #include <math.h>
 #include <stdint.h>
 #include "math_config.h"
@@ -136,4 +137,7 @@ exp2 (double x)
 #if USE_GLIBC_ABI
 strong_alias (exp2, __exp2_finite)
 hidden_alias (exp2, __ieee754_exp2)
+# if LDBL_MANT_DIG == 53
+long double exp2l (long double x) { return exp2 (x); }
+# endif
 #endif
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index aac2d4d..eed294b 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -5,9 +5,6 @@
  * SPDX-License-Identifier: MIT
  */

-float sinf (float);
-float cosf (float);
-float tanf (float);
 float expf (float);
 float exp2f (float);
 float logf (float);
diff --git a/math/log.c b/math/log.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT
  */

+#include <float.h>
 #include <math.h>
 #include <stdint.h>
 #include "math_config.h"
@@ -155,4 +156,7 @@ log (double x)
 #if USE_GLIBC_ABI
 strong_alias (log, __log_finite)
 hidden_alias (log, __ieee754_log)
+# if LDBL_MANT_DIG == 53
+long double logl (long double x) { return log (x); }
+# endif
 #endif
diff --git a/math/log2.c b/math/log2.c
index 478b33d..804fb85 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT
  */

+#include <float.h>
 #include <math.h>
 #include <stdint.h>
 #include "math_config.h"
@@ -134,4 +135,7 @@ log2 (double x)
 #if USE_GLIBC_ABI
 strong_alias (log2, __log2_finite)
 hidden_alias (log2, __ieee754_log2)
+# if LDBL_MANT_DIG == 53
+long double log2l (long double x) { return log2 (x); }
+# endif
 #endif
diff --git a/math/pow.c b/math/pow.c
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: MIT
  */

+#include <float.h>
 #include <math.h>
 #include <stdint.h>
 #include "math_config.h"
@@ -371,4 +372,7 @@ pow (double x, double y)
 #if USE_GLIBC_ABI
 strong_alias (pow, __pow_finite)
 hidden_alias (pow, __ieee754_pow)
+# if LDBL_MANT_DIG == 53
+long double powl (long double x, long double y) { return pow (x, y); }
+# endif
 #endif
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 8de6e5b..8782fb0 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -704,7 +704,7 @@ main (int argc, char *argv[])
   if (!USE_MPFR && conf.mpfr)
     {
       puts ("mpfr is not available.");
-      return 1;
+      return 0;
     }
   argc--;
   argv++;
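The LDBL_MANT_DIG == 53 guards added above rest on a representation fact: on targets where long double uses the same IEEE binary64 format as double (a 53-bit significand), the long double entry point can simply forward to the double routine with no loss of precision. A minimal standalone sketch of the same technique; my_expl is an illustrative name, not part of this commit:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    #if LDBL_MANT_DIG == 53
    /* long double and double share binary64 here, so forwarding is exact. */
    long double my_expl (long double x) { return exp (x); }
    #else
    /* A wider long double needs a genuine long double implementation. */
    long double my_expl (long double x) { return expl (x); }
    #endif

    int main (void)
    {
      printf ("LDBL_MANT_DIG=%d my_expl(1)=%.17Lg\n",
              (int) LDBL_MANT_DIG, my_expl (1.0L));
      return 0;
    }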
diff --git a/string/Dir.mk b/string/Dir.mk
index e179642..bd9979f 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -14,6 +14,18 @@ string-libs := \
 
 string-tools := \
 	build/bin/test/memcpy \
+	build/bin/test/memmove \
+	build/bin/test/memset \
+	build/bin/test/memchr \
+	build/bin/test/memcmp \
+	build/bin/test/strcpy \
+	build/bin/test/strcmp \
+	build/bin/test/strchr \
+	build/bin/test/strrchr \
+	build/bin/test/strchrnul \
+	build/bin/test/strlen \
+	build/bin/test/strnlen \
+	build/bin/test/strncmp
 
 string-lib-base := $(basename $(string-lib-srcs))
 string-lib-objs := \
 	$(string-lib-base:$(srcdir)/%=build/%.o)
@@ -47,5 +59,17 @@ build/bin/%.sh: $(srcdir)/string/test/%.sh
 
 check-string: $(string-tools)
 	$(EMULATOR) build/bin/test/memcpy
+	$(EMULATOR) build/bin/test/memmove
+	$(EMULATOR) build/bin/test/memset
+	$(EMULATOR) build/bin/test/memchr
+	$(EMULATOR) build/bin/test/memcmp
+	$(EMULATOR) build/bin/test/strcpy
+	$(EMULATOR) build/bin/test/strcmp
+	$(EMULATOR) build/bin/test/strchr
+	$(EMULATOR) build/bin/test/strrchr
+	$(EMULATOR) build/bin/test/strchrnul
+	$(EMULATOR) build/bin/test/strlen
+	$(EMULATOR) build/bin/test/strnlen
+	$(EMULATOR) build/bin/test/strncmp
 
 .PHONY: all-string check-string
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
new file mode 100644
index 0000000..0d75acd
--- /dev/null
+++ b/string/aarch64/memchr-sve.S
@@ -0,0 +1,62 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__memchr_aarch64_sve
+	.type	__memchr_aarch64_sve, %function
+	.p2align 4
+__memchr_aarch64_sve:
+	dup	z1.b, w1		/* duplicate c to a vector */
+	setffr				/* initialize FFR */
+	mov	x3, 0			/* initialize off */
+	nop
+
+0:	whilelo	p1.b, x3, x2		/* make sure off < max */
+	b.none	9f
+
+	/* Read a vector's worth of bytes, bounded by max,
+	   stopping on first fault.  */
+	ldff1b	z0.b, p1/z, [x0, x3]
+	rdffrs	p0.b, p1/z
+	b.nlast	2f
+
+	/* First fault did not fail: the vector bounded by max is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	incb	x3			/* speculate increment */
+	cmpeq	p2.b, p1/z, z0.b, z1.b	/* search for c */
+	b.none	0b
+	decb	x3			/* undo speculate */
+
+	/* Found C.  */
+1:	brkb	p2.b, p1/z, p2.b	/* find the first c */
+	add	x0, x0, x3		/* form partial pointer */
+	incp	x0, p2.b		/* form final pointer to c */
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	cmpeq	p2.b, p0/z, z0.b, z1.b
+	b.any	1b
+
+	/* No C found.  Re-init FFR, increment, and loop.  */
+	setffr
+	incp	x3, p0.b
+	b	0b
+
+	/* Found end of count.  */
+9:	mov	x0, 0			/* return null */
+	ret
+
+	.size	__memchr_aarch64_sve, . - __memchr_aarch64_sve
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
new file mode 100644
index 0000000..e5a3abf
--- /dev/null
+++ b/string/aarch64/memchr.S
@@ -0,0 +1,149 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define tmp		x4
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_chr1	v3
+#define vhas_chr2	v4
+#define vrepmask	v5
+#define vend		v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn __memchr_aarch64
+	/* Do not dereference srcin if no bytes to compare.  */
+	cbz	cntin, .Lzero_length
+	/*
+	 * Magic constant 0x40100401 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #31
+	and	cntrem, cntin, #31
+	b.eq	.Lloop
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	sub	tmp, soff, #32
+	adds	cntin, cntin, tmp
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b	/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	.Lmasklast
+	/* Have we found something already? */
+	cbnz	synd, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	subs	cntin, cntin, #32
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	.Lend
+	/* Use a fast check for the termination condition */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.2d[0]
+	/* We're not out of data, loop if we haven't found the character */
+	cbz	synd, .Lloop
+
+.Lend:
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b	/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	.Ltail
+
+.Lmasklast:
+	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32
+	neg	tmp, tmp, lsl #1
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result */
+	add	result, src, synd, lsr #1
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
	ret
+
+.Lzero_length:
+	mov	result, #0
+	ret
+
+	.size	__memchr_aarch64, . - __memchr_aarch64
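The syndrome scheme __memchr_aarch64 documents above ends with a purely scalar step: once the NEON compares have been masked with the 0x40100401 pattern and narrowed, each input byte owns two bits of a 64-bit word, so counting trailing zeros and halving yields the offset of the first match (the assembly uses rbit + clz because AArch64 has no ctz). A minimal C model of that decode step, under the assumption that the syndrome has already been formed; find_in_syndrome is a hypothetical helper, not part of this commit:

    #include <stdint.h>
    #include <stddef.h>

    /* Two syndrome bits per byte, bit 0 set on a match.  Returns the byte
       offset of the first matching byte, or -1 if the syndrome is empty. */
    static ptrdiff_t find_in_syndrome (uint64_t syndrome)
    {
      if (syndrome == 0)
        return -1;                          /* no byte matched */
      /* __builtin_ctzll plays the role of the rbit + clz pair. */
      return __builtin_ctzll (syndrome) >> 1;   /* two bits per byte */
    }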
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
new file mode 100644
index 0000000..d4f6026
--- /dev/null
+++ b/string/aarch64/memcmp-sve.S
@@ -0,0 +1,48 @@
+/*
+ * memcmp - compare memory
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__memcmp_aarch64_sve
+	.type	__memcmp_aarch64_sve, %function
+	.p2align 4
+__memcmp_aarch64_sve:
+	mov	x3, 0			/* initialize off */
+
+0:	whilelo	p0.b, x3, x2		/* while off < max */
+	b.none	9f
+
+	ld1b	z0.b, p0/z, [x0, x3]	/* read vectors bounded by max.  */
+	ld1b	z1.b, p0/z, [x1, x3]
+
+	/* Increment for a whole vector, even if we've only read a partial.
+	   This is significantly cheaper than INCP, and since OFF is not
+	   used after the loop it is ok to increment OFF past MAX.  */
+	incb	x3
+
+	cmpne	p1.b, p0/z, z0.b, z1.b	/* while no inequalities */
+	b.none	0b
+
+	/* Found inequality.  */
+1:	brkb	p1.b, p0/z, p1.b	/* find first such */
+	lasta	w0, p1, z0.b		/* extract each byte */
+	lasta	w1, p1, z1.b
+	sub	x0, x0, x1		/* return comparison */
+	ret
+
+	/* Found end-of-count.  */
+9:	mov	x0, 0			/* return equality */
+	ret
+
+	.size	__memcmp_aarch64_sve, . - __memcmp_aarch64_sve
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
new file mode 100644
index 0000000..72a66bc
--- /dev/null
+++ b/string/aarch64/memcmp.S
@@ -0,0 +1,141 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(l) .L ## l
+
+/* Parameters and result.  */
+#define src1	x0
+#define src2	x1
+#define limit	x2
+#define result	w0
+
+/* Internal variables.  */
+#define data1	x3
+#define data1w	w3
+#define data1h	x4
+#define data2	x5
+#define data2w	w5
+#define data2h	x6
+#define tmp1	x7
+#define tmp2	x8
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn __memcmp_aarch64 p2align=6
+	subs	limit, limit, 8
+	b.lo	L(less8)
+
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	.p2align 4
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
+
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+L(ret_eq):
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+	adds	limit, limit, 4
+	b.lo	L(less4)
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	L(return)
+	sub	limit, limit, 4
+L(less4):
+	adds	limit, limit, 4
+	beq	L(ret_eq)
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+	sub	result, data1w, data2w
+	ret
+
+	.size	__memcmp_aarch64, . - __memcmp_aarch64
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
new file mode 100644
index 0000000..4bbd288
--- /dev/null
+++ b/string/aarch64/memcpy.S
@@ -0,0 +1,178 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x9
+
+#define L(l) .L ## l
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.
+*/

+def_fn __memcpy_aarch64 p2align=6
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	2f
+1:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+2:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.size	__memcpy_aarch64, . - __memcpy_aarch64
The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp E_l, E_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp E_l, E_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + + .size __memmove_aarch64, . - __memmove_aarch64 diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S new file mode 100644 index 0000000..aef22e9 --- /dev/null +++ b/string/aarch64/memset.S @@ -0,0 +1,188 @@ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2012, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses + * + */ + + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define tmp1 x5 +#define tmp1w w5 +#define tmp2 x6 +#define tmp2w w6 +#define zva_len x7 +#define zva_lenw w7 + +#define L(l) .L ## l + + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +def_fn __memset_aarch64 p2align=6 + + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 + nop +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq L(try_zva) +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] +L(tail64): + subs count, count, 64 + b.hi 1b +2: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 +L(try_zva): + mrs tmp1, dczid_el0 + tbnz tmp1w, 4, L(no_zva) + and tmp1w, tmp1w, 15 + cmp tmp1w, 4 /* ZVA size is 64 bytes. */ + b.ne L(zva_128) + + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. + */ +L(zva_64): + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 +L(zva_128): + cmp tmp1w, 5 /* ZVA size is 128 bytes. */ + b.ne L(zva_other) + + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. 
*/ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(zva_other): + mov tmp2w, 4 + lsl zva_lenw, tmp2w, tmp1w + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. */ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + b L(tail64) + + .size __memset_aarch64, . - __memset_aarch64 diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S new file mode 100644 index 0000000..8d8a319 --- /dev/null +++ b/string/aarch64/strchr-sve.S @@ -0,0 +1,69 @@ +/* + * strchr/strchrnul - find a character in a string + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + + .arch armv8-a+sve + .text + +/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */ +#ifdef BUILD_STRCHRNUL +#define FUNC __strchrnul_aarch64_sve +#else +#define FUNC __strchr_aarch64_sve +#endif + + .globl FUNC + .type FUNC, %function + .p2align 4 +FUNC: + dup z1.b, w1 /* replicate byte across vector */ + setffr /* initialize FFR */ + ptrue p1.b /* all ones; loop invariant */ + + .p2align 4 + /* Read a vector's worth of bytes, stopping on first fault. */ +0: ldff1b z0.b, p1/z, [x0, xzr] + rdffrs p0.b, p1/z + b.nlast 2f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. */ + incb x0 /* speculate increment */ + cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */ + cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */ + orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */ + b.none 0b + decb x0 /* undo speculate */ + + /* Found C or 0. */ +1: brka p4.b, p1/z, p4.b /* find first such */ + sub x0, x0, 1 /* adjust pointer for that byte */ + incp x0, p4.b +#ifndef BUILD_STRCHRNUL + ptest p4, p2.b /* was first in c? */ + csel x0, xzr, x0, none /* if there was no c, return null */ +#endif + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparision only on the valid bytes. */ +2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */ + cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */ + orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */ + b.any 1b + + /* No C or 0 found. Re-init FFR, increment, and loop. */ + setffr + incp x0, p0.b + b 0b + + .size FUNC, . - FUNC diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S new file mode 100644 index 0000000..945be3d --- /dev/null +++ b/string/aarch64/strchr.S @@ -0,0 +1,137 @@ +/* + * strchr - find a character in a string + * + * Copyright (c) 2014-2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. 
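The L(try_zva) path above discovers the DC ZVA block size from DCZID_EL0: bits [3:0] encode log2 of the size in words (so the size in bytes is 4 << bits, which is why the code compares against 4 for 64 bytes and 5 for 128), and bit 4, when set, means the zero instruction is prohibited. A hedged C sketch of the same probe, AArch64-only, using inline assembly; zva_block_size is an illustrative name:

    #include <stddef.h>
    #include <stdint.h>

    /* Query the DC ZVA block size as memset's L(try_zva) does.
       Returns 0 if DC ZVA must not be used (DCZID_EL0 bit 4 set). */
    static size_t zva_block_size (void)
    {
    #if defined (__aarch64__)
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 16)                       /* zeroing prohibited */
        return 0;
      return (size_t) 4 << (dczid & 15);    /* bits [3:0]: log2(words) */
    #else
      return 0;                             /* not AArch64 */
    #endif
    }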
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
new file mode 100644
index 0000000..8d8a319
--- /dev/null
+++ b/string/aarch64/strchr-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strchr/strchrnul - find a character in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file.  */
+#ifdef BUILD_STRCHRNUL
+#define FUNC  __strchrnul_aarch64_sve
+#else
+#define FUNC  __strchr_aarch64_sve
+#endif
+
+	.globl	FUNC
+	.type	FUNC, %function
+	.p2align 4
+FUNC:
+	dup	z1.b, w1		/* replicate byte across vector */
+	setffr				/* initialize FFR */
+	ptrue	p1.b			/* all ones; loop invariant */
+
+	.p2align 4
+	/* Read a vector's worth of bytes, stopping on first fault.  */
+0:	ldff1b	z0.b, p1/z, [x0, xzr]
+	rdffrs	p0.b, p1/z
+	b.nlast	2f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	incb	x0			/* speculate increment */
+	cmpeq	p2.b, p1/z, z0.b, z1.b	/* search for c */
+	cmpeq	p3.b, p1/z, z0.b, 0	/* search for 0 */
+	orrs	p4.b, p1/z, p2.b, p3.b	/* c | 0 */
+	b.none	0b
+	decb	x0			/* undo speculate */
+
+	/* Found C or 0.  */
+1:	brka	p4.b, p1/z, p4.b	/* find first such */
+	sub	x0, x0, 1		/* adjust pointer for that byte */
+	incp	x0, p4.b
+#ifndef BUILD_STRCHRNUL
+	ptest	p4, p2.b		/* was first in c? */
+	csel	x0, xzr, x0, none	/* if there was no c, return null */
+#endif
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	cmpeq	p2.b, p0/z, z0.b, z1.b	/* search for c */
+	cmpeq	p3.b, p0/z, z0.b, 0	/* search for 0 */
+	orrs	p4.b, p0/z, p2.b, p3.b	/* c | 0 */
+	b.any	1b
+
+	/* No C or 0 found.  Re-init FFR, increment, and loop.  */
+	setffr
+	incp	x0, p0.b
+	b	0b
+
+	.size	FUNC, . - FUNC
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
new file mode 100644
index 0000000..945be3d
--- /dev/null
+++ b/string/aarch64/strchr.S
@@ -0,0 +1,137 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask_0	v7
+#define vrepmask_c	v16
+#define vend1		v17
+#define vend2		v18
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character; bit 1 is set
+   iff the relevant byte matched the NUL end of string (we trigger
+   off bit0 for the special case of looking for NUL).  Since the bits
+   in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination, and why.  */
+
+/* Locals and temporaries.  */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn __strchr_aarch64
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the requested byte.  Magic constant 0x80200802 used
+	   similarly for NUL termination.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask_c.4s, wtmp2
+	ands	tmp1, srcin, #31
+	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+	b.eq	.Lloop
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	lsl	tmp1, tmp1, #1
+	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
+	mov	tmp3, #~0
+	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
+	lsr	tmp1, tmp3, tmp1
+
+	mov	tmp3, vend1.2d[0]
+	bic	tmp1, tmp3, tmp1	// Mask padding bits.
+	cbnz	tmp1, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* Use a fast check for the termination condition.  */
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	orr	vend1.16b, vend1.16b, vend2.16b
+	addp	vend1.2d, vend1.2d, vend1.2d
+	mov	tmp1, vend1.2d[0]
+	cbz	tmp1, .Lloop
+
+	/* Termination condition found.  Now need to establish exactly why
+	   we terminated.  */
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
+	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
+
+	mov	tmp1, vend1.2d[0]
+.Ltail:
+	/* Count the trailing zeros, by bit reversing...  */
+	rbit	tmp1, tmp1
+	/* Re-bias source.  */
+	sub	src, src, #32
+	clz	tmp1, tmp1	/* And counting the leading zeros.  */
+	/* Tmp1 is even if the target character was found first.  Otherwise
+	   we've found the end of string and we weren't looking for NUL.  */
+	tst	tmp1, #1
+	add	result, src, tmp1, lsr #1
+	csel	result, result, xzr, eq
+	ret
+
+	.size	__strchr_aarch64, . - __strchr_aarch64
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
new file mode 100644
index 0000000..5140e59
--- /dev/null
+++ b/string/aarch64/strchrnul-sve.S
@@ -0,0 +1,9 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STRCHRNUL
+#include "strchr-sve.S"
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
new file mode 100644
index 0000000..d19c0e8
--- /dev/null
+++ b/string/aarch64/strchrnul.S
@@ -0,0 +1,122 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask	v7
+#define vend1		v16
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character or nul.  Since the
+   bits in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination.  */
+
+/* Locals and temporaries.  */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn __strchrnul_aarch64
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the termination condition.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask.4s, wtmp2
+	ands	tmp1, srcin, #31
+	b.eq	.Lloop
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	lsl	tmp1, tmp1, #1
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	mov	tmp3, #~0
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+	lsr	tmp1, tmp3, tmp1
+
+	mov	tmp3, vend1.2d[0]
+	bic	tmp1, tmp3, tmp1	// Mask padding bits.
+	cbnz	tmp1, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* Use a fast check for the termination condition.  */
+	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend1.2d, vend1.2d, vend1.2d
+	mov	tmp1, vend1.2d[0]
+	cbz	tmp1, .Lloop
+
+	/* Termination condition found.  Now need to establish exactly why
+	   we terminated.  */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+
+	mov	tmp1, vend1.2d[0]
+.Ltail:
+	/* Count the trailing zeros, by bit reversing...  */
+	rbit	tmp1, tmp1
+	/* Re-bias source.  */
+	sub	src, src, #32
+	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
+	/* tmp1 is twice the offset into the fragment.  */
+	add	result, src, tmp1, lsr #1
+	ret
+
+	.size	__strchrnul_aarch64, . - __strchrnul_aarch64
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
new file mode 100644
index 0000000..91bac19
--- /dev/null
+++ b/string/aarch64/strcmp-sve.S
@@ -0,0 +1,57 @@
+/*
+ * __strcmp_aarch64_sve - compare two strings
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__strcmp_aarch64_sve
+	.type	__strcmp_aarch64_sve, %function
+	.p2align 4
+__strcmp_aarch64_sve:
+	setffr				/* initialize FFR */
+	ptrue	p1.b, all		/* all ones; loop invariant */
+	mov	x2, 0			/* initialize offset */
+	nop
+
+	/* Read a vector's worth of bytes, stopping on first fault.  */
+0:	ldff1b	z0.b, p1/z, [x0, x2]
+	ldff1b	z1.b, p1/z, [x1, x2]
+	rdffrs	p0.b, p1/z
+	b.nlast	2f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	incb	x2, all			/* skip bytes for next round */
+	cmpeq	p2.b, p1/z, z0.b, z1.b	/* compare strings */
+	cmpne	p3.b, p1/z, z0.b, 0	/* search for ~zero */
+	nands	p2.b, p1/z, p2.b, p3.b	/* ~(eq & ~zero) -> ne | zero */
+	b.none	0b
+
+	/* Found end-of-string or inequality.  */
+1:	brkb	p2.b, p1/z, p2.b	/* find first such */
+	lasta	w0, p2, z0.b		/* extract each char */
+	lasta	w1, p2, z1.b
+	sub	x0, x0, x1		/* return comparison */
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	incp	x2, p0.b		/* skip bytes for next round */
+	setffr				/* re-init FFR for next round */
+	cmpeq	p2.b, p0/z, z0.b, z1.b	/* compare strings, as above */
+	cmpne	p3.b, p0/z, z0.b, 0
+	nands	p2.b, p0/z, p2.b, p3.b
+	b.none	0b
+	b	1b
+
+	.size	__strcmp_aarch64_sve, . - __strcmp_aarch64_sve
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
new file mode 100644
index 0000000..2aa367c
--- /dev/null
+++ b/string/aarch64/strcmp.S
@@ -0,0 +1,177 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x2
+#define data1w		w2
+#define data2		x3
+#define data2w		w3
+#define has_nul		x4
+#define diff		x5
+#define syndrome	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define zeroones	x10
+#define pos		x11
+
+	/* Start of performance-critical section  -- one 64B cache line.  */
+def_fn __strcmp_aarch64 p2align=6
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	b.ne	L(misaligned8)
+	ands	tmp1, src1, #7
+	b.ne	L(mutual_align)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+L(loop_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+L(start_realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+L(end):
+#ifndef __AARCH64EB__
+	rev	syndrome, syndrome
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	L(start_realigned)
+
+L(misaligned8):
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(loop_misaligned)
+L(do_misaligned):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of the end of a 4K page.  If
+	   yes then jump back to the misaligned loop to copy a byte at a time.  */
+	and	tmp1, src2, #0xff8
+	eor	tmp1, tmp1, #0xff8
+	cbz	tmp1, L(do_misaligned)
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_misaligned)
+	b	L(end)
+
+L(done):
+	sub	result, data1, data2
+	ret
+	.size	__strcmp_aarch64, .-__strcmp_aarch64
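The NUL-detection principle quoted in __strcmp_aarch64 above, and reused by strcpy and strlen below, is easy to verify in C: per 64-bit word, (x - REP8_01) & ~(x | REP8_7f) is non-zero iff some byte of x is zero, because only a zero byte both borrows in the subtraction and has its top bit clear in x | 0x7f. A small self-contained check of the identity; has_nul is an illustrative name:

    #include <stdint.h>
    #include <assert.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of x is zero; the same expression the
       aligned loops above compute with sub/orr/bic. */
    static uint64_t has_nul (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    int main (void)
    {
      assert (has_nul (0x6162630065666768ULL));   /* contains a 0x00 byte */
      assert (!has_nul (0x0101010101010101ULL));  /* no zero byte */
      assert (!has_nul (0xffffffffffffffffULL));
      return 0;
    }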
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
new file mode 100644
index 0000000..c929f37
--- /dev/null
+++ b/string/aarch64/strcpy-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.  */
+#ifdef BUILD_STPCPY
+#define FUNC  __stpcpy_aarch64_sve
+#else
+#define FUNC  __strcpy_aarch64_sve
+#endif
+
+	.globl	FUNC
+	.type	FUNC, %function
+	.p2align 4
+FUNC:
+	setffr				/* initialize FFR */
+	ptrue	p2.b, all		/* all ones; loop invariant */
+	mov	x2, 0			/* initialize offset */
+
+	.p2align 4
+	/* Read a vector's worth of bytes, stopping on first fault.  */
+0:	ldff1b	z0.b, p2/z, [x1, x2]
+	rdffrs	p0.b, p2/z
+	b.nlast	1f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	cmpeq	p1.b, p2/z, z0.b, 0	/* search for zeros */
+	b.any	2f
+
+	/* No zero found.  Store the whole vector and loop.  */
+	st1b	z0.b, p2, [x0, x2]
+	incb	x2, all
+	b	0b
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+1:	cmpeq	p1.b, p0/z, z0.b, 0	/* search for zeros */
+	b.any	2f
+
+	/* No zero found.  Store the valid portion of the vector and loop.  */
+	setffr				/* re-init FFR */
+	st1b	z0.b, p0, [x0, x2]
+	incp	x2, p0.b
+	b	0b
+
+	/* Zero found.  Crop the vector to the found zero and finish.  */
+2:	brka	p0.b, p2/z, p1.b
+	st1b	z0.b, p0, [x0, x2]
+#ifdef BUILD_STPCPY
+	add	x0, x0, x2
+	sub	x0, x0, 1
+	incp	x0, p0.b
+#endif
+	ret
+
+	.size	FUNC, . - FUNC
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
new file mode 100644
index 0000000..4e10b4d
--- /dev/null
+++ b/string/aarch64/strcpy.S
@@ -0,0 +1,314 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2013-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+   To test the page crossing code path more thoroughly, compile with
+   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+   entry path.  This option is not intended for production use.  */
+
+/* Arguments and results.  */
+#define dstin		x0
+#define srcin		x1
+
+/* Locals and temporaries.  */
+#define src		x2
+#define dst		x3
+#define data1		x4
+#define data1w		w4
+#define data2		x5
+#define data2w		w5
+#define has_nul1	x6
+#define has_nul2	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define tmp4		x11
+#define zeroones	x12
+#define data1a		x13
+#define data2a		x14
+#define pos		x15
+#define len		x16
+#define to_align	x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY __stpcpy_aarch64
+#else
+#define STRCPY __strcpy_aarch64
+#endif
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
+	   page size check for crossing this boundary on entry and if we
+	   do not, then we can short-circuit much of the entry code.  We
+	   expect early page-crossing strings to be rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+	   predictable, even with random strings.
+
+	   We don't bother checking for larger page sizes, the cost of setting
+	   up the correct page size is just not worth the extra gain from
+	   a small reduction in the cases taking the slow path.  Note that
+	   we only care about whether the first fetch, which may be
+	   misaligned, crosses a page boundary - after that we move to aligned
+	   fetches for the remainder of the string.  */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+	/* Make everything that isn't Qword aligned look like a page cross.  */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+def_fn STRCPY p2align=6
+	/* For moderately short strings, the fastest way to do the copy is to
+	   calculate the length of the string in the same way as strlen, then
+	   essentially do a memcpy of the result.  This avoids the need for
+	   multiple byte copies and further means that by the time we
+	   reach the bulk copy loop we know we can always use DWord
+	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
+	   with the same source string, so branch prediction is likely to
+	   always be difficult - we mitigate against this by preferring
+	   conditional select operations over branches whenever this is
+	   feasible.  */
+	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+	mov	zeroones, #REP8_01
+	and	to_align, srcin, #15
+	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
+	neg	tmp1, to_align
+	/* The first fetch will straddle a (possible) page boundary iff
+	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
+	   aligned string will never fail the page align check, so will
+	   always take the fast path.  */
+	b.gt	.Lpage_cross
+
+.Lpage_cross_ok:
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* Because we expect the end to be found within 16 characters
+	   (profiling shows this is the most common case), it's worth
+	   swapping the bytes now to save having to recalculate the
+	   termination syndrome later.  We preserve data1 and data2
+	   so that we can re-use the values later on.  */
+	rev	tmp2, data1
+	sub	tmp1, tmp2, zeroones
+	orr	tmp2, tmp2, #REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	b.ne	.Lfp_le8
+	rev	tmp4, data2
+	sub	tmp3, tmp4, zeroones
+	orr	tmp4, tmp4, #REP8_7f
+#else
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	b.ne	.Lfp_le8
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+#endif
+	bics	has_nul2, tmp3, tmp4
+	b.eq	.Lbulk_entry
+
+	/* The string is short (<=16 bytes).  We don't know exactly how
+	   short though, yet.  Work out the exact length so that we can
+	   quickly select the optimal copy strategy.  */
+.Lfp_gt8:
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	mov	tmp2, #56
+	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
+	sub	pos, tmp2, pos
+#ifdef __AARCH64EB__
+	lsr	data2, data2, pos
+#else
+	lsl	data2, data2, pos
+#endif
+	str	data2, [dst, #1]
+	str	data1, [dstin]
+#ifdef BUILD_STPCPY
+	add	dstin, dst, #8
+#endif
+	ret
+
+.Lfp_le8:
+	rev	has_nul1, has_nul1
+	clz	pos, has_nul1
+	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
+	subs	tmp2, pos, #24			/* Pos in bits. */
+	b.lt	.Lfp_lt4
+#ifdef __AARCH64EB__
+	mov	tmp2, #56
+	sub	pos, tmp2, pos
+	lsr	data2, data1, pos
+	lsr	data1, data1, #32
+#else
+	lsr	data2, data1, tmp2
+#endif
+	/* 4->7 bytes to copy.  */
+	str	data2w, [dst, #-3]
+	str	data1w, [dstin]
+#ifdef BUILD_STPCPY
+	mov	dstin, dst
+#endif
+	ret
+.Lfp_lt4:
+	cbz	pos, .Lfp_lt2
+	/* 2->3 bytes to copy.  */
+#ifdef __AARCH64EB__
+	lsr	data1, data1, #48
+#endif
+	strh	data1w, [dstin]
+	/* Fall-through, one byte (max) to go.  */
+.Lfp_lt2:
+	/* Null-terminated string.  Last character must be zero!  */
+	strb	wzr, [dst]
+#ifdef BUILD_STPCPY
+	mov	dstin, dst
+#endif
+	ret
+
+	.p2align 6
+	/* Aligning here ensures that the entry code and main loop all lies
+	   within one 64-byte cache line.  */
+.Lbulk_entry:
+	sub	to_align, to_align, #16
+	stp	data1, data2, [dstin]
+	sub	src, srcin, to_align
+	sub	dst, dstin, to_align
+	b	.Lentry_no_page_cross
+
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+.Lmain_loop:
+	stp	data1, data2, [dst], #16
+.Lentry_no_page_cross:
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	.Lmain_loop
+
+	/* Since we know we are copying at least 16 bytes, the fastest way
+	   to deal with the tail is to determine the location of the
+	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
+	cmp	has_nul1, #0
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, ne
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, ne
+#endif
+	rev	has_nul1, has_nul1
+	clz	pos, has_nul1
+	add	tmp1, pos, #72
+	add	pos, pos, #8
+	csel	pos, pos, tmp1, ne
+	add	src, src, pos, lsr #3
+	add	dst, dst, pos, lsr #3
+	ldp	data1, data2, [src, #-32]
+	stp	data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+	sub	dstin, dst, #1
+#endif
+	ret
+
+.Lpage_cross:
+	bic	src, srcin, #15
+	/* Start by loading two words at [srcin & ~15], then forcing the
+	   bytes that precede srcin to 0xff.  This means they never look
+	   like termination bytes.  */
+	ldp	data1, data2, [src]
+	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
+	tst	to_align, #7
+	csetm	tmp2, ne
+#ifdef __AARCH64EB__
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+	cmp	to_align, #8
+	csinv	data1, data1, xzr, lt
+	csel	data2, data2, data2a, lt
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	.Lpage_cross_ok
+	/* We now need to make data1 and data2 look like they've been
+	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
+	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
+	neg	tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+	lsl	data1a, data1, tmp1
+	lsr	tmp4, data2, tmp2
+	lsl	data2, data2, tmp1
+	orr	tmp4, tmp4, data1a
+	cmp	to_align, #8
+	csel	data1, tmp4, data2, lt
+	rev	tmp2, data1
+	rev	tmp4, data2
+	sub	tmp1, tmp2, zeroones
+	orr	tmp2, tmp2, #REP8_7f
+	sub	tmp3, tmp4, zeroones
+	orr	tmp4, tmp4, #REP8_7f
+#else
+	lsr	data1a, data1, tmp1
+	lsl	tmp4, data2, tmp2
+	lsr	data2, data2, tmp1
+	orr	tmp4, tmp4, data1a
+	cmp	to_align, #8
+	csel	data1, tmp4, data2, lt
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+#endif
+	bic	has_nul1, tmp1, tmp2
+	cbnz	has_nul1, .Lfp_le8
+	bic	has_nul2, tmp3, tmp4
+	b	.Lfp_gt8
+
+	.size	STRCPY, . - STRCPY
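Both strcpy above and strlen below guard their initial unaligned 16-byte fetch with the same page-cross test: the fetch can only straddle a page if the address's offset within a (minimum) 4 KiB page exceeds PAGE_SIZE - 16. A C sketch of the predicate, assuming the 4 KiB minimum the comments describe; may_cross_page is an illustrative name:

    #include <stdbool.h>
    #include <stdint.h>

    #define MIN_PAGE_SIZE 4096  /* minimum AArch64 page size, per the comments */

    /* True iff a 16-byte load at addr may cross into the next page,
       mirroring the entry checks of the strcpy/strlen routines. */
    static bool may_cross_page (uintptr_t addr)
    {
      return (addr & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
    }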
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
new file mode 100644
index 0000000..64ede85
--- /dev/null
+++ b/string/aarch64/strlen-sve.S
@@ -0,0 +1,55 @@
+/*
+ * __strlen_aarch64_sve - compute the length of a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__strlen_aarch64_sve
+	.type	__strlen_aarch64_sve, %function
+	.p2align 4
+__strlen_aarch64_sve:
+	setffr			/* initialize FFR */
+	ptrue	p2.b		/* all ones; loop invariant */
+	mov	x1, 0		/* initialize length */
+	nop
+
+	/* Read a vector's worth of bytes, stopping on first fault.  */
+0:	ldff1b	z0.b, p2/z, [x0, x1]
+	nop
+	rdffrs	p0.b, p2/z
+	b.nlast	2f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	incb	x1, all			/* speculate increment */
+	cmpeq	p1.b, p2/z, z0.b, 0	/* loop if no zeros */
+	b.none	0b
+	decb	x1, all			/* undo speculate */
+
+	/* Zero found.  Select the bytes before the first and count them.  */
+1:	brkb	p0.b, p2/z, p1.b
+	incp	x1, p0.b
+	mov	x0, x1
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	cmpeq	p1.b, p0/z, z0.b, 0
+	b.any	1b
+
+	/* No zero found.  Re-init FFR, increment, and loop.  */
+	setffr
+	incp	x1, p0.b
+	b	0b
+
+	.size	__strlen_aarch64_sve, . - __strlen_aarch64_sve
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
new file mode 100644
index 0000000..26388d7
--- /dev/null
+++ b/string/aarch64/strlen.S
@@ -0,0 +1,214 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define len		x0
+
+/* Locals and temporaries.  */
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+#define L(l) .L ## l
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.  */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   __strlen_aarch64 will be repeatedly called on strings with the same
+	   length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
+def_fn __strlen_aarch64 p2align=6
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+.Lpage_cross_entry:
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.  */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
+#endif
+	sub	len, src, srcin
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
+
+	.size	__strlen_aarch64, . - __strlen_aarch64
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
new file mode 100644
index 0000000..6f31eca
--- /dev/null
+++ b/string/aarch64/strncmp-sve.S
@@ -0,0 +1,66 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__strncmp_aarch64_sve
+	.type	__strncmp_aarch64_sve, %function
+	.p2align 4
+__strncmp_aarch64_sve:
+	setffr				/* initialize FFR */
+	mov	x3, 0			/* initialize off */
+
+0:	whilelo	p0.b, x3, x2		/* while off < max */
+	b.none	9f
+
+	ldff1b	z0.b, p0/z, [x0, x3]
+	ldff1b	z1.b, p0/z, [x1, x3]
+	rdffrs	p1.b, p0/z
+	b.nlast	2f
+
+	/* First fault did not fail: the vector up to max is valid.
+	   Avoid depending on the contents of FFR beyond the branch.
+	   Increment for a whole vector, even if we've only read a partial.
+	   This is significantly cheaper than INCP, and since OFF is not
+	   used after the loop it is ok to increment OFF past MAX.  */
+	incb	x3
+	cmpeq	p1.b, p0/z, z0.b, z1.b	/* compare strings */
+	cmpne	p2.b, p0/z, z0.b, 0	/* search for ~zero */
+	nands	p2.b, p0/z, p1.b, p2.b	/* ~(eq & ~zero) -> ne | zero */
+	b.none	0b
+
+	/* Found end-of-string or inequality.  */
+1:	brkb	p2.b, p0/z, p2.b	/* find first such */
+	lasta	w0, p2, z0.b		/* extract each char */
+	lasta	w1, p2, z1.b
+	sub	x0, x0, x1		/* return comparison */
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	cmpeq	p2.b, p1/z, z0.b, z1.b	/* compare strings, as above */
+	cmpne	p3.b, p1/z, z0.b, 0
+	nands	p2.b, p1/z, p2.b, p3.b
+	b.any	1b
+
+	/* No inequality or zero found.  Re-init FFR, incr and loop.  */
+	setffr
+	incp	x3, p1.b
+	b	0b
+
+	/* Found end-of-count.  */
+9:	mov	x0, 0			/* return equal */
+	ret
+
+	.size	__strncmp_aarch64_sve, . - __strncmp_aarch64_sve
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
new file mode 100644
index 0000000..ced72b9
--- /dev/null
+++ b/string/aarch64/strncmp.S
@@ -0,0 +1,266 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define has_nul		x5
+#define diff		x6
+#define syndrome	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define zeroones	x11
+#define pos		x12
+#define limit_wd	x13
+#define mask		x14
+#define endloop		x15
+#define count		mask
+
+	.text
+	.p2align 6
+	.rep 7
+	nop	/* Pad so that the loop below fits a cache line.  */
+	.endr
+def_fn __strncmp_aarch64
+	cbz	limit, .Lret0
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	and	count, src1, #7
+	b.ne	.Lmisaligned8
+	cbnz	count, .Lmutual_align
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.
*/ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + /* Start of performance-critical section -- one 64B cache line. */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq .Lloop_aligned + /* End of performance-critical section -- one 64B cache line. */ + + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lnot_limit + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +.Lnot_limit: + orr syndrome, diff, has_nul + +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +.Lmutual_align: + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ + /* Big-endian. 
Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+#endif
+	and	tmp3, limit_wd, #7
+	lsr	limit_wd, limit_wd, #3
+	/* Adjust the limit.  Only low 3 bits used, so overflow irrelevant.  */
+	add	limit, limit, count
+	add	tmp3, tmp3, count
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	add	limit_wd, limit_wd, tmp3, lsr #3
+	b	.Lstart_realigned
+
+	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
+.Lmisaligned8:
+	cmp	limit, #16
+	b.hs	.Ltry_misaligned_words
+
+.Lbyte_loop:
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+.Ldone:
+	sub	result, data1, data2
+	ret
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+.Ltry_misaligned_words:
+	lsr	limit_wd, limit, #3
+	cbz	count, .Ldo_misaligned
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	lsr	limit_wd, limit, #3
+
+.Lpage_end_loop:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	.Ldone
+	subs	count, count, #1
+	b.hi	.Lpage_end_loop
+
+.Ldo_misaligned:
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	.Ldone_loop
+.Lloop_misaligned:
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, .Lpage_end_loop
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+	subs	limit_wd, limit_wd, #1
+	b.pl	.Lloop_misaligned
+
+.Ldone_loop:
+	/* We found a difference or a NUL before the limit was reached.  */
+	and	limit, limit, #7
+	cbz	limit, .Lnot_limit
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+
+.Lret0:
+	mov	result, #0
+	ret
+	.size	__strncmp_aarch64, . - __strncmp_aarch64
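The .Lbyte_loop above is worth a note: SUBS plus two chained CCMPs fold the limit test, the end-of-string test and the equality test into a single flag sequence, so one B.EQ keeps the loop going. As a behavioural reference only -- not the word-at-a-time algorithm -- the routine must match this portable C sketch (strncmp_ref is a hypothetical name, not part of this library):

#include <stddef.h>

/* Byte-at-a-time reference for the contract __strncmp_aarch64 must
   preserve: stop at the limit, at a common NUL, or at the first
   differing byte, comparing bytes as unsigned char.  */
static int
strncmp_ref (const char *s1, const char *s2, size_t n)
{
  const unsigned char *p1 = (const unsigned char *) s1;
  const unsigned char *p2 = (const unsigned char *) s2;
  for (; n != 0; n--, p1++, p2++)
    {
      if (*p1 != *p2)
	return *p1 - *p2;	/* First difference decides.  */
      if (*p1 == '\0')
	return 0;		/* Common NUL: strings are equal.  */
    }
  return 0;			/* Limit reached without a difference.  */
}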
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
new file mode 100644
index 0000000..3a9be08
--- /dev/null
+++ b/string/aarch64/strnlen-sve.S
@@ -0,0 +1,72 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__strnlen_aarch64_sve
+	.type	__strnlen_aarch64_sve, %function
+	.p2align 4
+__strnlen_aarch64_sve:
+	setffr				/* initialize FFR */
+	mov	x2, 0			/* initialize len */
+	b	1f
+
+	.p2align 4
+	/* We have off + vl <= max, and so may read the whole vector.  */
+0:	ldff1b	z0.b, p0/z, [x0, x2]
+	rdffrs	p1.b, p0/z
+	b.nlast	2f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	cmpeq	p2.b, p0/z, z0.b, 0
+	b.any	8f
+	incb	x2
+
+1:	whilelo	p0.b, x2, x1
+	b.last	0b
+
+	/* We have off + vl > max.  Test for off == max before proceeding.  */
+	b.none	9f
+
+	ldff1b	z0.b, p0/z, [x0, x2]
+	rdffrs	p1.b, p0/z
+	b.nlast	2f
+
+	/* First fault did not fail: the vector up to max is valid.
+	   Avoid depending on the contents of FFR beyond the branch.
+	   Compare for end-of-string, but there are no more bytes.  */
+	cmpeq	p2.b, p0/z, z0.b, 0
+
+	/* Found end-of-string or zero.  */
+8:	brkb	p2.b, p0/z, p2.b
+	mov	x0, x2
+	incp	x0, p2.b
+	ret
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes.  */
+2:	cmpeq	p2.b, p1/z, z0.b, 0
+	b.any	8b
+
+	/* No zero found.  Re-init FFR, increment, and loop.  */
+	setffr
+	incp	x2, p1.b
+	b	1b
+
+	/* End of count.  Return max.  */
+9:	mov	x0, x2
+	ret
+
+	.size	__strnlen_aarch64_sve, . - __strnlen_aarch64_sve
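As the comments above describe, the SVE routine advances a whole vector per iteration, uses first-faulting loads so that speculative reads past the valid region are suppressed rather than faulting, and lets WHILELO clamp the final partial vector at the limit. The scalar contract it must match is simply the following (strnlen_ref is a hypothetical name; the real entry point is __strnlen_aarch64_sve):

#include <stddef.h>

/* Reference semantics for strnlen: the length of the string, but
   never more than maxlen.  Unlike the SVE code, this sketch never
   touches memory beyond the first NUL or beyond s + maxlen.  */
static size_t
strnlen_ref (const char *s, size_t maxlen)
{
  size_t n = 0;
  while (n < maxlen && s[n] != '\0')
    n++;
  return n;
}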
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
new file mode 100644
index 0000000..b02c846
--- /dev/null
+++ b/string/aarch64/strnlen.S
@@ -0,0 +1,160 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results.  */
+#define srcin	x0
+#define len	x0
+#define limit	x1
+
+/* Locals and temporaries.  */
+#define src	x2
+#define data1	x3
+#define data2	x4
+#define data2a	x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1	x8
+#define tmp2	x9
+#define tmp3	x10
+#define tmp4	x11
+#define zeroones x12
+#define pos	x13
+#define limit_wd x14
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	.text
+	.p2align	6
+.Lstart:
+	/* Pre-pad to ensure critical loop begins an icache line.  */
+	.rep 7
+	nop
+	.endr
+	/* Put this code here to avoid wasting more space with pre-padding.  */
+.Lhit_limit:
+	mov	len, limit
+	ret
+
+def_fn __strnlen_aarch64
+	cbz	limit, .Lhit_limit
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	.Lmisaligned
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+.Lloop:
+	ldp	data1, data2, [src], #16
+.Lrealigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1
+	orr	tmp1, has_nul1, has_nul2
+	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
+	b.eq	.Lloop
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	orr	tmp1, has_nul1, has_nul2
+	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+	sub	len, src, srcin
+	cbz	has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+	mov	data2, data1
+#endif
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3	/* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls	/* Return the lower value.  */
+	ret
+
+.Lmisaligned:
+	/* Deal with a partial first word.
+	   We're doing two things in parallel here:
+	   1) Calculate the number of words (but avoiding overflow if
+	      limit is near ULONG_MAX) - to do this we need to work out
+	      limit + tmp1 - 1 as a 65-bit value before shifting it;
+	   2) Load and mask the initial data words - we force the bytes
+	      before the ones we are interested in to 0xff - this ensures
+	      early bytes will not hit any zero detection.  */
+	sub	limit_wd, limit, #1
+	neg	tmp4, tmp1
+	cmp	tmp1, #8
+
+	and	tmp3, limit_wd, #15
+	lsr	limit_wd, limit_wd, #4
+	mov	tmp2, #~0
+
+	ldp	data1, data2, [src], #16
+	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
+	add	tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
+#endif
+	add	limit_wd, limit_wd, tmp3, lsr #4
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	.Lrealigned
+	.size	__strnlen_aarch64, . - .Lstart	/* Include pre-padding in size.  */
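The "(X - 1) & ~(X | 0x7f)" identity that this file and the other word-at-a-time routines rely on is compact enough to state in C. A sketch follows; the helper names are hypothetical, and __builtin_ctzll is a GCC/Clang builtin (the assembly achieves the same lookup with REV + CLZ instead):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero.  The first (least significant)
   zero byte reliably yields 0x80 in the result; markers in higher
   bytes may be corrupted by borrow propagation, which is exactly the
   carry caveat the big-endian paths above work around by
   byte-swapping the data and recomputing the syndrome.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Index of the first zero byte for a little-endian load.  The only
   set bits of the syndrome are bit 7 of each marked byte, so the
   lowest set bit is at 8*i + 7 for the first zero byte i.  The
   caller must ensure has_zero_byte (x) != 0.  */
static inline unsigned
first_zero_byte_le (uint64_t x)
{
  return (unsigned) __builtin_ctzll (has_zero_byte (x)) >> 3;
}

The cheaper check mentioned in the comments, (X - 1) & 0x80..80, would save an instruction but reports false hits for bytes in 0x81..0xff, which is why the inner loops above use it only as a fast filter and recompute the accurate syndrome on a hit.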
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
new file mode 100644
index 0000000..bb522e7
--- /dev/null
+++ b/string/aarch64/strrchr-sve.S
@@ -0,0 +1,83 @@
+/*
+ * strrchr - find the last occurrence of a character in a string
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+	.arch	armv8-a+sve
+	.text
+
+	.globl	__strrchr_aarch64_sve
+	.type	__strrchr_aarch64_sve, %function
+	.p2align 4
+__strrchr_aarch64_sve:
+	dup	z1.b, w1		/* replicate byte across vector */
+	setffr				/* initialize FFR */
+	ptrue	p1.b			/* all ones; loop invariant */
+	mov	x2, 0			/* no match found so far */
+	pfalse	p2.b
+
+	.p2align 4
+	/* Read a vector's worth of bytes, stopping on first fault.  */
+0:	ldff1b	z0.b, p1/z, [x0, xzr]
+	rdffrs	p0.b, p1/z
+	b.nlast	1f
+
+	/* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch.  */
+	incb	x0, all			/* skip bytes this round */
+	cmpeq	p3.b, p1/z, z0.b, 0	/* search for 0 */
+	b.any	3f
+
+	cmpeq	p3.b, p1/z, z0.b, z1.b	/* search for c; no eos */
+	b.none	0b
+
+	mov	x2, x0			/* save advanced base */
+	mov	p2.b, p3.b		/* save current search */
+	b	0b
+
+	/* First fault failed: only some of the vector is valid.
+	   Perform the comparisons only on the valid bytes.  */
+1:	cmpeq	p3.b, p0/z, z0.b, 0	/* search for 0 */
+	b.any	2f
+
+	cmpeq	p3.b, p0/z, z0.b, z1.b	/* search for c; no eos */
+	mov	x3, x0
+	incp	x0, p0.b		/* skip bytes this round */
+	setffr				/* re-init FFR */
+	b.none	0b
+
+	addvl	x2, x3, 1		/* save advanced base */
+	mov	p2.b, p3.b		/* save current search */
+	b	0b
+
+	/* Found end-of-string.  */
+2:	incb	x0, all			/* advance base */
+3:	brka	p3.b, p1/z, p3.b	/* mask after first 0 */
+	cmpeq	p3.b, p3/z, z0.b, z1.b	/* search for c not after eos */
+	b.any	4f
+
+	/* No C within last vector.  Did we have one before?  */
+	cbz	x2, 5f
+	mov	x0, x2			/* restore advanced base */
+	mov	p3.b, p2.b		/* restore saved search */
+
+	/* Find the *last* match in the predicate.  This is slightly
+	   more complicated than finding the first match.  */
+4:	rev	p3.b, p3.b		/* reverse the bits */
+	brka	p3.b, p1/z, p3.b	/* find position of last match */
+	decp	x0, p3.b		/* retard pointer to last match */
+	ret
+
+	/* No C whatsoever.  Return NULL.  */
+5:	mov	x0, 0
+	ret
+
+	.size	__strrchr_aarch64_sve, . - __strrchr_aarch64_sve
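The "save advanced base / save current search" pairing above is the vector form of a simple idea: scan forward, and whenever the current block contains a match (with no terminator yet), remember it; at end-of-string, the most recently remembered match is the answer. A minimal scalar sketch of that strategy (strrchr_ref is a hypothetical name):

/* Forward scan that remembers the most recent match, matching ISO C
   strrchr semantics, including c == '\0' returning a pointer to the
   terminator.  */
static char *
strrchr_ref (const char *s, int c)
{
  const unsigned char ch = (unsigned char) c;
  const char *last = 0;		/* no match found so far */
  for (;; s++)
    {
      if ((unsigned char) *s == ch)
	last = s;		/* save current match */
      if (*s == '\0')
	return (char *) last;	/* NULL if c never occurred */
    }
}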
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
new file mode 100644
index 0000000..2eff4d1
--- /dev/null
+++ b/string/arm/memchr.S
@@ -0,0 +1,133 @@
+/*
+ * memchr - scan memory for a character
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This __memchr_arm routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.  It has a fast path for short sizes, and has
+   an optimised path for large data sets; the worst case is finding the
+   match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@    Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@    Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@    Removed unneeded cbz from align loop
+
+	.syntax unified
+	.arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+	.thumb_func
+	.align 2
+	.p2align 4,,15
+	.global __memchr_arm
+	.type __memchr_arm,%function
+__memchr_arm:
+	@ r0 = start of memory to scan
+	@ r1 = character to look for
+	@ r2 = length
+	@ returns r0 = pointer to character or NULL if not found
+	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
+
+	cmp	r2,#16		@ If it's short don't bother with anything clever
+	blt	20f
+
+	tst	r0, #7		@ If it's already aligned skip the next bit
+	beq	10f
+
+	@ Work up to an aligned point
+5:
+	ldrb	r3, [r0],#1
+	subs	r2, r2, #1
+	cmp	r3, r1
+	beq	50f		@ If it matches exit found
+	tst	r0, #7
+	bne	5b		@ If not aligned yet then do next byte
+
+10:
+	@ At this point, we are aligned, we know we have at least 8 bytes to work with
+	push	{r4,r5,r6,r7}
+	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
+	orr	r1, r1, r1, lsl #16
+	bic	r4, r2, #7	@ Number of double words to work with
+	mvns	r7, #0		@ all F's
+	movs	r3, #0
+
+15:
+	ldmia	r0!,{r5,r6}
+	subs	r4, r4, #8
+	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
+	eor	r6,r6, r1
+	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained: bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	cbnz	r6, 60f
+	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
+
+	pop	{r4,r5,r6,r7}
+	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
+	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
+
+20:
+	cbz	r2, 40f		@ 0 length or hit the end already then not found
+
+21:	@ Post aligned section, or just a short call
+	ldrb	r3,[r0],#1
+	subs	r2,r2,#1
+	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
+	cbz	r3, 50f
+	bne	21b		@ on r2 flags
+
+40:
+	movs	r0,#0		@ not found
+	bx	lr
+
+50:
+	subs	r0,r0,#1	@ found
+	bx	lr
+
+60:	@ We're here because the fast path found a hit - now we have to track down exactly which word it was
+	@ r0 points to the start of the double word after the one that was tested
+	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+	cmp	r5, #0
+	itte	eq
+	moveq	r5, r6		@ the end is in the 2nd word
+	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
+	subne	r0,r0,#7	@ or 2nd byte of 1st word
+
+	@ r0 currently points to the 3rd byte of the word containing the hit
+	tst	r5, #CHARTSTMASK(0)	@ 1st character
+	bne	61f
+	adds	r0,r0,#1
+	tst	r5, #CHARTSTMASK(1)	@ 2nd character
+	ittt	eq
+	addeq	r0,r0,#1
+	tsteq	r5, #(3<<15)		@ 2nd & 3rd character
+	@ If not the 3rd must be the last one
+	addeq	r0,r0,#1
+
+61:
+	pop	{r4,r5,r6,r7}
+	subs	r0,r0,#1
+	bx	lr
+
+	.size	__memchr_arm, . - __memchr_arm
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
new file mode 100644
index 0000000..3346e4f
--- /dev/null
+++ b/string/arm/memcpy.S
@@ -0,0 +1,593 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+   of VFP or NEON when built with the appropriate flags.
+
+   Assumptions:
+
+    ARMv6 (ARMv7-a if using Neon)
+    ARM state
+    Unaligned accesses
+
+ */
+
+	.syntax unified
+	/* This implementation requires ARM state.  */
+	.arm
+
+#ifdef __ARM_NEON__
+
+	.fpu	neon
+	.arch	armv7-a
+# define FRAME_SIZE	4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+	.arch	armv6
+	.fpu	vfpv2
+# define FRAME_SIZE	32
+# define USE_VFP
+
+#else
+	.arch	armv6
+# define FRAME_SIZE	32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics.  */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET	8	/* PC pipeline compensation.  */
+#define INSN_SIZE	4
+
+/* Call parameters.  */
+#define dstin	r0
+#define src	r1
+#define count	r2
+
+/* Locals.  */
+#define tmp1	r3
+#define dst	ip
+#define tmp2	r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers.  */
+#define	A_l	r2		/* Call-clobbered.  */
+#define	A_h	r3		/* Call-clobbered.  */
+#define	B_l	r4
+#define	B_h	r5
+#define	C_l	r6
+#define	C_h	r7
+#define	D_l	r8
+#define	D_h	r9
+#endif
+
+/* Number of lines ahead to pre-fetch data.  If you change this the code
+   below will need adjustment to compensate.
*/ + +#define prefetch_lines 5 + +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm + + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif + + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +def_fn __memcpy_arm p2align=6 + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. 
*/ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #7 + and tmp1, dst, #7 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 64-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring SRC and DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 + +1: + subs tmp2, count, #64 /* Use tmp2 for count. */ + blt .Ltail63aligned + + cmp tmp2, #512 + bge .Lcpy_body_long + +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone + +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. */ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 64-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. 
*/ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b + +2: + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. 
*/ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif + +.Lcpy_notaligned: + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrmi tmp2, [sp], #FRAME_SIZE + bmi .Ltail63unaligned + pld [src, #(4 * 64)] + +#ifdef USE_NEON + vld1.8 {d0-d3}, [src]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bmi 2f +1: + pld [src, #(4 * 64)] + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vld1.8 {d0-d3}, [src]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bpl 1b +2: + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. */ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b + + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + bne .Ltail63unaligned + bx lr + + .size __memcpy_arm, . - __memcpy_arm diff --git a/string/arm/memset.S b/string/arm/memset.S new file mode 100644 index 0000000..3ee5238 --- /dev/null +++ b/string/arm/memset.S @@ -0,0 +1,99 @@ +/* + * memset - fill memory with a constant + * + * Copyright (c) 2010, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* + Written by Dave Gilbert <david.gilbert@linaro.org> + + This memset routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. 
+ + */ + + .syntax unified + .arch armv7-a + +@ 2011-08-30 david.gilbert@linaro.org +@ Extracted from local git 2f11b436 + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + .text + .thumb + +@ --------------------------------------------------------------------------- + .thumb_func + .align 2 + .p2align 4,,15 + .global __memset_arm + .type __memset_arm,%function +__memset_arm: + @ r0 = address + @ r1 = character + @ r2 = count + @ returns original address in r0 + + mov r3, r0 @ Leave r0 alone + cbz r2, 10f @ Exit if 0 length + + tst r0, #7 + beq 2f @ Already aligned + + @ Ok, so we're misaligned here +1: + strb r1, [r3], #1 + subs r2,r2,#1 + tst r3, #7 + cbz r2, 10f @ Exit if we hit the end + bne 1b @ go round again if still misaligned + +2: + @ OK, so we're aligned + push {r4,r5,r6,r7} + bics r4, r2, #15 @ if less than 16 bytes then need to finish it off + beq 5f + +3: + @ POSIX says that ch is cast to an unsigned char. A uxtb is one + @ byte and takes two cycles, where an AND is four bytes but one + @ cycle. + and r1, #0xFF + orr r1, r1, r1, lsl#8 @ Same character into all bytes + orr r1, r1, r1, lsl#16 + mov r5,r1 + mov r6,r1 + mov r7,r1 + +4: + subs r4,r4,#16 + stmia r3!,{r1,r5,r6,r7} + bne 4b + and r2,r2,#15 + + @ At this point we're still aligned and we have upto align-1 bytes left to right + @ we can avoid some of the byte-at-a time now by testing for some big chunks + tst r2,#8 + itt ne + subne r2,r2,#8 + stmiane r3!,{r1,r5} + +5: + pop {r4,r5,r6,r7} + cbz r2, 10f + + @ Got to do any last < alignment bytes +6: + subs r2,r2,#1 + strb r1,[r3],#1 + bne 6b + +10: + bx lr @ goodbye + .size __memset_arm, . - __memset_arm diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S new file mode 100644 index 0000000..5ea06c9 --- /dev/null +++ b/string/arm/strcmp-armv6m.S @@ -0,0 +1,118 @@ +/* + * strcmp for ARMv6-M (optimized for performance, not size) + * + * Copyright (c) 2014-2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + + .thumb_func + .syntax unified + .arch armv6-m + + .macro DoSub n, label + subs r0, r0, r1 +#ifdef __ARM_BIG_ENDIAN + lsrs r1, r4, \n +#else + lsls r1, r4, \n +#endif + orrs r1, r0 + bne \label + .endm + + .macro Byte_Test n, label + lsrs r0, r2, \n + lsrs r1, r3, \n + DoSub \n, \label + .endm + + .text + .p2align 0 + .global __strcmp_armv6m + .type __strcmp_armv6m, %function +__strcmp_armv6m: + .cfi_startproc + mov r2, r0 + push {r4, r5, r6, lr} + orrs r2, r1 + lsls r2, r2, #30 + bne 6f + ldr r5, =0x01010101 + lsls r6, r5, #7 +1: + ldmia r0!, {r2} + ldmia r1!, {r3} + subs r4, r2, r5 + bics r4, r2 + ands r4, r6 + beq 3f + +#ifdef __ARM_BIG_ENDIAN + Byte_Test #24, 4f + Byte_Test #16, 4f + Byte_Test #8, 4f + + b 7f +3: + cmp r2, r3 + beq 1b + cmp r2, r3 +#else + uxtb r0, r2 + uxtb r1, r3 + DoSub #24, 2f + + uxth r0, r2 + uxth r1, r3 + DoSub #16, 2f + + lsls r0, r2, #8 + lsls r1, r3, #8 + lsrs r0, r0, #8 + lsrs r1, r1, #8 + DoSub #8, 2f + + lsrs r0, r2, #24 + lsrs r1, r3, #24 + subs r0, r0, r1 +2: + pop {r4, r5, r6, pc} + +3: + cmp r2, r3 + beq 1b + rev r0, r2 + rev r1, r3 + cmp r0, r1 +#endif + + bls 5f + movs r0, #1 +4: + pop {r4, r5, r6, pc} +5: + movs r0, #0 + mvns r0, r0 + pop {r4, r5, r6, pc} +6: + ldrb r2, [r0, #0] + ldrb r3, [r1, #0] + adds r0, #1 + adds r1, #1 + cmp r2, #0 + beq 7f + cmp r2, r3 + bne 7f + ldrb r2, [r0, #0] + ldrb r3, [r1, #0] + adds r0, #1 + adds r1, #1 + cmp r2, #0 + beq 7f + cmp r2, r3 + beq 6b +7: + subs r0, r2, r3 + pop {r4, r5, r6, pc} + .cfi_endproc + .size __strcmp_armv6m, . - __strcmp_armv6m diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S new file mode 100644 index 0000000..fb9cae3 --- /dev/null +++ b/string/arm/strcmp.S @@ -0,0 +1,479 @@ +/* + * strcmp for ARMv7 + * + * Copyright (c) 2012-2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Implementation of strcmp for ARMv7 when DSP instructions are + available. Use ldrd to support wider loads, provided the data + is sufficiently aligned. Use saturating arithmetic to optimize + the compares. */ + +/* Build Options: + STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first + byte in the string. If comparing completely random strings + the pre-check will save time, since there is a very high + probability of a mismatch in the first character: we save + significant overhead if this is the common case. However, + if strings are likely to be identical (eg because we're + verifying a hit in a hash table), then this check is largely + redundant. */ + +#define STRCMP_NO_PRECHECK 0 + + /* This version uses Thumb-2 code. */ + .thumb + .syntax unified + +#ifdef __ARM_BIG_ENDIAN +#define S2LO lsl +#define S2LOEQ lsleq +#define S2HI lsr +#define MSB 0x000000ff +#define LSB 0xff000000 +#define BYTE0_OFFSET 24 +#define BYTE1_OFFSET 16 +#define BYTE2_OFFSET 8 +#define BYTE3_OFFSET 0 +#else /* not __ARM_BIG_ENDIAN */ +#define S2LO lsr +#define S2LOEQ lsreq +#define S2HI lsl +#define BYTE0_OFFSET 0 +#define BYTE1_OFFSET 8 +#define BYTE2_OFFSET 16 +#define BYTE3_OFFSET 24 +#define MSB 0xff000000 +#define LSB 0x000000ff +#endif /* not __ARM_BIG_ENDIAN */ + + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define tmp1 r4 +#define tmp2 r5 +#define const_m1 r12 + +/* Additional internal variables for 64-bit aligned data. 
*/ +#define data1a r2 +#define data1b r3 +#define data2a r6 +#define data2b r7 +#define syndrome_a tmp1 +#define syndrome_b tmp2 + +/* Additional internal variables for 32-bit aligned data. */ +#define data1 r2 +#define data2 r3 +#define syndrome tmp2 + + + /* Macro to compute and return the result value for word-aligned + cases. */ + .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 +#ifdef __ARM_BIG_ENDIAN + /* If data1 contains a zero byte, then syndrome will contain a 1 in + bit 7 of that byte. Otherwise, the highest set bit in the + syndrome will highlight the first different bit. It is therefore + sufficient to extract the eight bits starting with the syndrome + bit. */ + clz tmp1, \synd + lsl r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsl \d1, \d1, tmp1 + .cfi_remember_state + lsr result, \d1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1, lsr #24 + bx lr +#else + /* To use the big-endian trick we'd have to reverse all three words. + that's slower than this approach. */ + rev \synd, \synd + clz tmp1, \synd + bic tmp1, tmp1, #7 + lsr r1, \d2, tmp1 + .cfi_remember_state + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsr \d1, \d1, tmp1 + and result, \d1, #255 + and r1, r1, #255 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1 + + bx lr +#endif + .endm + + .text + .p2align 5 +.Lstrcmp_start_addr: +#if STRCMP_NO_PRECHECK == 0 +.Lfastpath_exit: + sub r0, r2, r3 + bx lr + nop +#endif +def_fn __strcmp_arm +#if STRCMP_NO_PRECHECK == 0 + ldrb r2, [src1] + ldrb r3, [src2] + cmp r2, #1 + it cs + cmpcs r2, r3 + bne .Lfastpath_exit +#endif + .cfi_startproc + strd r4, r5, [sp, #-16]! + .cfi_def_cfa_offset 16 + .cfi_offset 4, -16 + .cfi_offset 5, -12 + orr tmp1, src1, src2 + strd r6, r7, [sp, #8] + .cfi_offset 6, -8 + .cfi_offset 7, -4 + mvn const_m1, #0 + lsl r2, tmp1, #29 + cbz r2, .Lloop_aligned8 + +.Lnot_aligned: + eor tmp1, src1, src2 + tst tmp1, #7 + bne .Lmisaligned8 + + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + and tmp1, src1, #7 + bic src1, src1, #7 + and tmp2, tmp1, #3 + bic src2, src2, #7 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + ldrd data1a, data1b, [src1], #16 + tst tmp1, #4 + ldrd data2a, data2b, [src2], #16 + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp2 + orn data1a, data1a, tmp1 + orn data2a, data2a, tmp1 + beq .Lstart_realigned8 + orn data1b, data1b, tmp1 + mov data1a, const_m1 + orn data2b, data2b, tmp1 + mov data2a, const_m1 + b .Lstart_realigned8 + + /* Unwind the inner loop by a factor of 2, giving 16 bytes per + pass. */ + .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ + .p2align 2 /* Always word aligned. */ +.Lloop_aligned8: + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 +.Lstart_realigned8: + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + cbnz syndrome_a, .Ldiff_in_a + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. 
*/ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + cbnz syndrome_b, .Ldiff_in_b + + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + /* Can't use CBZ for backwards branch. */ + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + beq .Lloop_aligned8 + +.Ldiff_found: + cbnz syndrome_a, .Ldiff_in_a + +.Ldiff_in_b: + strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 + +.Ldiff_in_a: + .cfi_restore_state + strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 + + .cfi_restore_state +.Lmisaligned8: + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align4 + + /* Unrolled by a factor of 2, to reduce the number of post-increment + operations. */ +.Lloop_aligned4: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned4: + uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cbnz syndrome, .Laligned4_done + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] + uadd8 syndrome, data1, const_m1 + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + beq .Lloop_aligned4 + +.Laligned4_done: + strcmp_epilogue_aligned syndrome, data1, data2, 0 + +.Lmutual_align4: + .cfi_restore_state + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + lsl tmp1, tmp1, #3 /* Bytes -> bits. */ + bic src1, src1, #3 + ldr data1, [src1], #8 + bic src2, src2, #3 + ldr data2, [src2], #8 + + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp1 + orn data1, data1, tmp1 + orn data2, data2, tmp1 + b .Lstart_realigned4 + +.Lmisaligned4: + ands tmp1, src1, #3 + beq .Lsrc1_aligned + sub src2, src2, tmp1 + bic src1, src1, #3 + lsls tmp1, tmp1, #31 + ldr data1, [src1], #4 + beq .Laligned_m2 + bcs .Laligned_m1 + +#if STRCMP_NO_PRECHECK == 1 + ldrb data2, [src2, #1] + uxtb tmp1, data1, ror #BYTE1_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m1: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + add src2, src2, #4 + cbnz data2, .Lsrc1_aligned +#else /* STRCMP_NO_PRECHECK */ + /* If we've done the pre-check, then we don't need to check the + first byte again here. */ + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbnz data2, .Laligned_m1 +#endif + +.Lmisaligned_exit: + .cfi_remember_state + mov result, tmp1 + ldr r4, [sp], #16 + .cfi_restore 4 + bx lr + +#if STRCMP_NO_PRECHECK == 0 +.Laligned_m1: + add src2, src2, #4 +#endif +.Lsrc1_aligned: + .cfi_restore_state + /* src1 is word aligned, but src2 has no common alignment + with it. */ + ldr data1, [src1], #4 + lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. 
*/
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 Not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	neg	result, result
+	bx	lr
+
+6:
+	.cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	bx	lr
+
+.Lstrcmp_tail:
+	.cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	sub	result, result, data2, lsr #24
+	bx	lr
+	.cfi_endproc
+	.size	__strcmp_arm, . - .Lstrcmp_start_addr
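Once a difference or a NUL has been detected in a word, all of these comparison routines reduce to the same step: merge the two conditions into one syndrome, locate the first marked byte, and subtract the bytes as unsigned values. The Thumb-2 code above does this with UADD8/SEL to build the syndrome and REV + CLZ to locate it; a rough little-endian C analogue using 64-bit words (resolve_word_cmp_le is a hypothetical name, and __builtin_ctzll is a GCC/Clang builtin) is:

#include <stdint.h>

/* Caller guarantees that w1 and w2 differ or that w1 contains a NUL
   byte, i.e. that the syndrome below is non-zero.  */
static int
resolve_word_cmp_le (uint64_t w1, uint64_t w2)
{
  /* 0x80 marker in (at least) the first zero byte of w1.  */
  uint64_t zero = (w1 - 0x0101010101010101ULL) & ~(w1 | 0x7f7f7f7f7f7f7f7fULL);
  /* Difference bits or NUL marker, whichever comes first.  */
  uint64_t synd = (w1 ^ w2) | zero;
  /* Round the first marked bit down to its byte.  */
  unsigned shift = (unsigned) __builtin_ctzll (synd) & ~7u;
  return (int) ((w1 >> shift) & 0xff) - (int) ((w2 >> shift) & 0xff);
}

If the first marker is a common NUL the selected bytes are both zero and the result is 0, which is why merging the two conditions is safe; the big-endian paths cannot use the trick directly because the syndrome's high-byte markers can be corrupted by borrow propagation, exactly as the comments in .Lstrcmp_tail explain.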
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
new file mode 100644
index 0000000..48ebbe8
--- /dev/null
+++ b/string/arm/strcpy.c
@@ -0,0 +1,129 @@
+/*
+ * strcpy
+ *
+ * Copyright (c) 2008-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
+
+char* __attribute__((naked))
+__strcpy_arm (char* dst, const char* src)
+{
+  __asm__ (
+  "pld	[r1, #0]\n\t"
+  "eor	r2, r0, r1\n\t"
+  "mov	ip, r0\n\t"
+  "tst	r2, #3\n\t"
+  "bne	4f\n\t"
+  "tst	r1, #3\n\t"
+  "bne	3f\n"
+  "5:\n\t"
+# ifndef __thumb2__
+  "str	r5, [sp, #-4]!\n\t"
+  "mov	r5, #0x01\n\t"
+  "orr	r5, r5, r5, lsl #8\n\t"
+  "orr	r5, r5, r5, lsl #16\n\t"
+# endif
+
+  "str	r4, [sp, #-4]!\n\t"
+  "tst	r1, #4\n\t"
+  "ldr	r3, [r1], #4\n\t"
+  "beq	2f\n\t"
+  "sub	r2, r3, "magic1(r5)"\n\t"
+  "bics	r2, r2, r3\n\t"
+  "tst	r2, "magic2(r5)"\n\t"
+  "itt	eq\n\t"
+  "streq	r3, [ip], #4\n\t"
+  "ldreq	r3, [r1], #4\n"
+  "bne	1f\n\t"
+  /* Inner loop.  We now know that r1 is 64-bit aligned, so we
+     can safely fetch up to two words.  This allows us to avoid
+     load stalls.  */
+  ".p2align 2\n"
+  "2:\n\t"
+  "pld	[r1, #8]\n\t"
+  "ldr	r4, [r1], #4\n\t"
+  "sub	r2, r3, "magic1(r5)"\n\t"
+  "bics	r2, r2, r3\n\t"
+  "tst	r2, "magic2(r5)"\n\t"
+  "sub	r2, r4, "magic1(r5)"\n\t"
+  "bne	1f\n\t"
+  "str	r3, [ip], #4\n\t"
+  "bics	r2, r2, r4\n\t"
+  "tst	r2, "magic2(r5)"\n\t"
+  "itt	eq\n\t"
+  "ldreq	r3, [r1], #4\n\t"
+  "streq	r4, [ip], #4\n\t"
+  "beq	2b\n\t"
+  "mov	r3, r4\n"
+  "1:\n\t"
+# ifdef __ARMEB__
+  "rors	r3, r3, #24\n\t"
+# endif
+  "strb	r3, [ip], #1\n\t"
+  "tst	r3, #0xff\n\t"
+# ifdef __ARMEL__
+  "ror	r3, r3, #8\n\t"
+# endif
+  "bne	1b\n\t"
+  "ldr	r4, [sp], #4\n\t"
+# ifndef __thumb2__
+  "ldr	r5, [sp], #4\n\t"
+# endif
+  "BX LR\n"
+
+  /* Strings have the same offset from word alignment, but it's
+     not zero.  */
+  "3:\n\t"
+  "tst	r1, #1\n\t"
+  "beq	1f\n\t"
+  "ldrb	r2, [r1], #1\n\t"
+  "strb	r2, [ip], #1\n\t"
+  "cmp	r2, #0\n\t"
+  "it	eq\n"
+  "BXEQ LR\n"
+  "1:\n\t"
+  "tst	r1, #2\n\t"
+  "beq	5b\n\t"
+  "ldrh	r2, [r1], #2\n\t"
+# ifdef __ARMEB__
+  "tst	r2, #0xff00\n\t"
+  "iteet	ne\n\t"
+  "strneh	r2, [ip], #2\n\t"
+  "lsreq	r2, r2, #8\n\t"
+  "streqb	r2, [ip]\n\t"
+  "tstne	r2, #0xff\n\t"
+# else
+  "tst	r2, #0xff\n\t"
+  "itet	ne\n\t"
+  "strneh	r2, [ip], #2\n\t"
+  "streqb	r2, [ip]\n\t"
+  "tstne	r2, #0xff00\n\t"
+# endif
+  "bne	5b\n\t"
+  "BX LR\n"
+
+  /* src and dst do not have a common word-alignment.  Fall back to
+     byte copying.  */
+  "4:\n\t"
+  "ldrb	r2, [r1], #1\n\t"
+  "strb	r2, [ip], #1\n\t"
+  "cmp	r2, #0\n\t"
+  "bne	4b\n\t"
+  "BX LR");
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
new file mode 100644
index 0000000..279ec87
--- /dev/null
+++ b/string/arm/strlen-armv6t2.S
@@ -0,0 +1,125 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+   Assumes:
+   ARMv6T2, AArch32
+
+ */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#ifdef __ARMEB__
+#define S2LO	lsl
+#define S2HI	lsr
+#else
+#define S2LO	lsr
+#define S2HI	lsl
+#endif
+
+	/* This code requires Thumb.  */
+	.thumb
+	.syntax unified
+
+/* Parameters and result.  */
+#define srcin		r0
+#define result		r0
+
+/* Internal variables.  */
+#define src		r1
+#define data1a		r2
+#define data1b		r3
+#define const_m1	r12
+#define const_0		r4
+#define tmp1		r4	/* Overlaps const_0  */
+#define tmp2		r5
+
+def_fn __strlen_armv6t2 p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!
+ bic src, srcin, #7 + mvn const_m1, #0 + ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ + pld [src, #32] + bne.w .Lmisaligned8 + mov const_0, #0 + mov result, #-8 +.Lloop_aligned: + /* Bytes 0-7. */ + ldrd data1a, data1b, [src] + pld [src, #64] + add result, result, #8 +.Lstart_realigned: + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, .Lnull_found + + /* Bytes 8-15. */ + ldrd data1a, data1b, [src, #8] + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, .Lnull_found + + /* Bytes 16-23. */ + ldrd data1a, data1b, [src, #16] + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, .Lnull_found + + /* Bytes 24-31. */ + ldrd data1a, data1b, [src, #24] + add src, src, #32 + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cmp data1b, #0 + beq .Lloop_aligned + +.Lnull_found: + cmp data1a, #0 + itt eq + addeq result, result, #4 + moveq data1a, data1b +#ifndef __ARMEB__ + rev data1a, data1a +#endif + clz data1a, data1a + ldrd r4, r5, [sp], #8 + add result, result, data1a, lsr #3 /* Bits -> Bytes. */ + bx lr + +.Lmisaligned8: + ldrd data1a, data1b, [src] + and tmp2, tmp1, #3 + rsb result, tmp1, #0 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + tst tmp1, #4 + pld [src, #64] + S2HI tmp2, const_m1, tmp2 + orn data1a, data1a, tmp2 + itt ne + ornne data1b, data1b, tmp2 + movne data1a, const_m1 + mov const_0, #0 + b .Lstart_realigned + .size __strlen_armv6t2, . 
- __strlen_armv6t2
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 753d06a..96647cf 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -14,4 +14,36 @@
#if __aarch64__
void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t);
+void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64 (void *, int, size_t);
+void *__memchr_aarch64 (const void *, int, size_t);
+int __memcmp_aarch64 (const void *, const void *, size_t);
+char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+int __strcmp_aarch64 (const char *, const char *);
+char *__strchr_aarch64 (const char *, int);
+char *__strchrnul_aarch64 (const char *, int);
+size_t __strlen_aarch64 (const char *);
+size_t __strnlen_aarch64 (const char *, size_t);
+int __strncmp_aarch64 (const char *, const char *, size_t);
+# if __ARM_FEATURE_SVE
+void *__memchr_aarch64_sve (const void *, int, size_t);
+int __memcmp_aarch64_sve (const void *, const void *, size_t);
+char *__strchr_aarch64_sve (const char *, int);
+char *__strrchr_aarch64_sve (const char *, int);
+char *__strchrnul_aarch64_sve (const char *, int);
+int __strcmp_aarch64_sve (const char *, const char *);
+char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+size_t __strlen_aarch64_sve (const char *);
+size_t __strnlen_aarch64_sve (const char *, size_t);
+int __strncmp_aarch64_sve (const char *, const char *, size_t);
+# endif
+#elif __arm__
+void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
+void *__memset_arm (void *, int, size_t);
+void *__memchr_arm (const void *, int, size_t);
+char *__strcpy_arm (char *__restrict, const char *__restrict);
+int __strcmp_arm (const char *, const char *);
+int __strcmp_armv6m (const char *, const char *);
+size_t __strlen_armv6t2 (const char *);
#endif
diff --git a/string/memchr.S b/string/memchr.S
new file mode 100644
index 0000000..0a564d8
--- /dev/null
+++ b/string/memchr.S
@@ -0,0 +1,15 @@
+/*
+ * Selected possible memchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memchr-sve.S"
+# endif
+#elif __arm__
+#include "arm/memchr.S"
+#endif
diff --git a/string/memcmp.S b/string/memcmp.S
new file mode 100644
index 0000000..22da685
--- /dev/null
+++ b/string/memcmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible memcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memcmp-sve.S"
+# endif
+#endif
diff --git a/string/memcpy.S b/string/memcpy.S
new file mode 100644
index 0000000..c0f23e3
--- /dev/null
+++ b/string/memcpy.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcpy.S"
+#elif __arm__
+#include "arm/memcpy.S"
+#endif
diff --git a/string/memmove.S b/string/memmove.S
new file mode 100644
index 0000000..be3c7a1
--- /dev/null
+++ b/string/memmove.S
@@ -0,0 +1,10 @@
+/*
+ * Selected possible memmove implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
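+ *
+ * Note: the top-level .S files in this directory merely select an
+ * architecture-specific implementation via the preprocessor; on a
+ * target that matches none of the #if branches the file compiles to an
+ * empty object and the C library's own routine is used instead.  There
+ * is currently no AArch32 memmove entry here.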
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memmove.S"
+#endif
diff --git a/string/memset.S b/string/memset.S
new file mode 100644
index 0000000..57542ef
--- /dev/null
+++ b/string/memset.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memset implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memset.S"
+#elif __arm__
+#include "arm/memset.S"
+#endif
diff --git a/string/strchr.S b/string/strchr.S
new file mode 100644
index 0000000..8cead02
--- /dev/null
+++ b/string/strchr.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchr-sve.S"
+# endif
+#endif
diff --git a/string/strchrnul.S b/string/strchrnul.S
new file mode 100644
index 0000000..3dfdeef
--- /dev/null
+++ b/string/strchrnul.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchrnul implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchrnul.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchrnul-sve.S"
+# endif
+#endif
diff --git a/string/strcmp.S b/string/strcmp.S
new file mode 100644
index 0000000..12530ec
--- /dev/null
+++ b/string/strcmp.S
@@ -0,0 +1,19 @@
+/*
+ * Selected possible strcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcmp-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+#include "arm/strcmp.S"
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+#include "arm/strcmp-armv6m.S"
+# endif
+#endif
diff --git a/string/strcpy-c.c b/string/strcpy-c.c
new file mode 100644
index 0000000..6bde24a
--- /dev/null
+++ b/string/strcpy-c.c
@@ -0,0 +1,10 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __arm__ && defined (__thumb2__) && !defined (__thumb__)
+#include "arm/strcpy.c"
+#endif
diff --git a/string/strcpy.S b/string/strcpy.S
new file mode 100644
index 0000000..a604b22
--- /dev/null
+++ b/string/strcpy.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcpy.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcpy-sve.S"
+# endif
+#endif
diff --git a/string/strlen.S b/string/strlen.S
new file mode 100644
index 0000000..d681033
--- /dev/null
+++ b/string/strlen.S
@@ -0,0 +1,17 @@
+/*
+ * Selected possible strlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strlen-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+#include "arm/strlen-armv6t2.S"
+# endif
+#endif
diff --git a/string/strncmp.S b/string/strncmp.S
new file mode 100644
index 0000000..26b56b7
--- /dev/null
+++ b/string/strncmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strncmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
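+ *
+ * Note: when the compiler predefines __ARM_FEATURE_SVE, the SVE variant
+ * is assembled alongside the base AArch64 routine; both entry points
+ * (__strncmp_aarch64 and __strncmp_aarch64_sve) are declared in
+ * stringlib.h and exercised by string/test/strncmp.c.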
+ * SPDX-License-Identifier: MIT + */ + +#if __aarch64__ +#include "aarch64/strncmp.S" +# if __ARM_FEATURE_SVE +#include "aarch64/strncmp-sve.S" +# endif +#endif diff --git a/string/strnlen.S b/string/strnlen.S new file mode 100644 index 0000000..eebe777 --- /dev/null +++ b/string/strnlen.S @@ -0,0 +1,13 @@ +/* + * Selected possible strnlen implementations. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __aarch64__ +#include "aarch64/strnlen.S" +# if __ARM_FEATURE_SVE +#include "aarch64/strnlen-sve.S" +# endif +#endif diff --git a/string/strrchr.S b/string/strrchr.S new file mode 100644 index 0000000..18b1cf9 --- /dev/null +++ b/string/strrchr.S @@ -0,0 +1,12 @@ +/* + * Selected possible strrchr implementations. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __aarch64__ +# if __ARM_FEATURE_SVE +#include "aarch64/strrchr-sve.S" +# endif +#endif diff --git a/string/test/memchr.c b/string/test/memchr.c new file mode 100644 index 0000000..8d609c9 --- /dev/null +++ b/string/test/memchr.c @@ -0,0 +1,94 @@ +/* + * memchr test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + void *(*fun)(const void *, int c, size_t n); +} funtab[] = { +#define F(x) {#x, x}, +F(memchr) +#if __aarch64__ +F(__memchr_aarch64) +# if __ARM_FEATURE_SVE +F(__memchr_aarch64_sve) +# endif +#elif __arm__ +F(__memchr_arm) +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static unsigned char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int seekpos, int len) +{ + unsigned char *src = alignup(sbuf); + unsigned char *s = src + align; + unsigned char *f = len ? s + seekpos : 0; + int seekchar = 0x1; + int i; + void *p; + + if (len > LEN || seekpos >= len || align >= A) + abort(); + + for (i = 0; i < seekpos; i++) + s[i] = 'a' + i%23; + s[i++] = seekchar; + for (; i < len; i++) + s[i] = 'a' + i%23; + + p = fun->fun(s, seekchar, len); + + if (p != f) { + ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); + ERR("expected: %p\n", f); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + for (int n = 0; n < 100; n++) + for (int sp = 0; sp < n-1; sp++) + test(funtab+i, a, sp, n); + for (int n = 100; n < LEN; n *= 2) { + test(funtab+i, a, n-1, n); + test(funtab+i, a, n/2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/memcmp.c b/string/test/memcmp.c new file mode 100644 index 0000000..63b07bd --- /dev/null +++ b/string/test/memcmp.c @@ -0,0 +1,97 @@ +/* + * memcmp test. + * + * Copyright (c) 2019, Arm Limited. 
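+ *
+ * Note: the tests in this directory share one pattern: a funtab array
+ * of {name, function} pairs built with the F() macro is swept over a
+ * range of buffer alignments and lengths.  Hooking in an extra
+ * implementation only takes a prototype in stringlib.h plus a funtab
+ * entry, e.g. (hypothetical name, for illustration only):
+ *
+ *   int __memcmp_custom (const void *, const void *, size_t);
+ *   F(__memcmp_custom)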
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + int (*fun)(const void *s1, const void *s2, size_t n); +} funtab[] = { +#define F(x) {#x, x}, +F(memcmp) +#if __aarch64__ +F(__memcmp_aarch64) +# if __ARM_FEATURE_SVE +F(__memcmp_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static unsigned char s1buf[LEN+2*A]; +static unsigned char s2buf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos) +{ + unsigned char *src1 = alignup(s1buf); + unsigned char *src2 = alignup(s2buf); + unsigned char *s1 = src1 + s1align; + unsigned char *s2 = src2 + s2align; + int r; + + if (len > LEN || s1align >= A || s2align >= A) + abort(); + if (diffpos && diffpos >= len) + abort(); + + for (int i = 0; i < len+A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i%23; + if (diffpos) + s1[diffpos]++; + + r = fun->fun(s1, s2, len); + + if ((!diffpos && r != 0) || (diffpos && r == 0)) { + ERR("%s(align %d, align %d, %d) failed, returned %d\n", + fun->name, s1align, s2align, len, r); + ERR("src1: %.*s\n", s1align+len+1, src1); + ERR("src2: %.*s\n", s2align+len+1, src2); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) { + test(funtab+i, d, s, n, 0); + test(funtab+i, d, s, n, n / 2); + } + for (; n < LEN; n *= 2) { + test(funtab+i, d, s, n, 0); + test(funtab+i, d, s, n, n / 2); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 1dccac7..26ab0ec 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -20,6 +20,9 @@ static const struct fun F(memcpy) #if __aarch64__ F(__memcpy_bytewise) +F(__memcpy_aarch64) +#elif __arm__ +F(__memcpy_arm) #endif #undef F {0, 0} diff --git a/string/test/memmove.c b/string/test/memmove.c new file mode 100644 index 0000000..8164383 --- /dev/null +++ b/string/test/memmove.c @@ -0,0 +1,142 @@ +/* + * memmove test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + void *(*fun)(void *, const void *, size_t); +} funtab[] = { +#define F(x) {#x, x}, +F(memmove) +#if __aarch64__ +F(__memmove_aarch64) +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static unsigned char dbuf[LEN+2*A]; +static unsigned char sbuf[LEN+2*A]; +static unsigned char wbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int dalign, int salign, int len) +{ + unsigned char *src = alignup(sbuf); + unsigned char *dst = alignup(dbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = want + dalign; + void *p; + int i; + + if (len > LEN || dalign >= A || salign >= A) + abort(); + for (i = 0; i < len+A; i++) { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + i%23; + + p = fun->fun(d, s, len); + if (p != d) + ERR("%s(%p,..) returned %p\n", fun->name, d, p); + for (i = 0; i < len+A; i++) { + if (dst[i] != want[i]) { + ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); + ERR("got : %.*s\n", dalign+len+1, dst); + ERR("want: %.*s\n", dalign+len+1, want); + break; + } + } +} + +static void test_overlap(const struct fun *fun, int dalign, int salign, int len) +{ + unsigned char *src = alignup(sbuf); + unsigned char *dst = alignup(sbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = wbuf + dalign; + void *p; + + if (len > LEN || dalign >= A || salign >= A) + abort(); + + for (int i = 0; i < len+A; i++) + src[i] = want[i] = '?'; + + for (int i = 0; i < len; i++) + s[i] = w[i] = 'a' + i%23; + + /* Copy the potential overlap range. */ + if (s < d) { + for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++) + want[salign+i] = src[salign+i]; + } else { + for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++) + want[len + dalign + i] = src[len + dalign + i]; + } + + p = fun->fun(d, s, len); + if (p != d) + ERR("%s(%p,..) returned %p\n", fun->name, d, p); + for (int i = 0; i < len+A; i++) { + if (dst[i] != want[i]) { + ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); + ERR("got : %.*s\n", dalign+len+1, dst); + ERR("want: %.*s\n", dalign+len+1, want); + abort(); + break; + } + } +} + +int main() +{ + test_overlap(funtab+0, 2, 1, 1); + + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) { + test(funtab+i, d, s, n); + test_overlap(funtab+i, d, s, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, d, s, n); + test_overlap(funtab+i, d, s, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/memset.c b/string/test/memset.c new file mode 100644 index 0000000..c0c7ed6 --- /dev/null +++ b/string/test/memset.c @@ -0,0 +1,112 @@ +/* + * memset test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + void *(*fun)(void *s, int c, size_t n); +} funtab[] = { +#define F(x) {#x, x}, +F(memset) +#if __aarch64__ +F(__memset_aarch64) +#elif __arm__ +F(__memset_arm) +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static unsigned char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void err(const char *name, unsigned char *src, int salign, int c, int len) +{ + ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len); + ERR("got : %.*s\n", salign+len+1, src); +} + +static void test(const struct fun *fun, int salign, int c, int len) +{ + unsigned char *src = alignup(sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (len > LEN || salign >= A) + abort(); + for (i = 0; i < len+A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i%23; + for (; i<len%A; i++) + s[i] = '*'; + + p = fun->fun(s, c, len); + if (p != s) + ERR("%s(%p,..) returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) { + if (src[i] != '?') { + err(fun->name, src, salign, c, len); + return; + } + } + for (i = salign; i < len; i++) { + if (src[i] != (unsigned char)c) { + err(fun->name, src, salign, c, len); + return; + } + } + for (; i < len%A; i++) { + if (src[i] != '*') { + err(fun->name, src, salign, c, len); + return; + } + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) { + test(funtab+i, s, 0, n); + test(funtab+i, s, 0x25, n); + test(funtab+i, s, 0xaa25, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, s, 0, n); + test(funtab+i, s, 0x25, n); + test(funtab+i, s, 0xaa25, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strchr.c b/string/test/strchr.c new file mode 100644 index 0000000..30c714f --- /dev/null +++ b/string/test/strchr.c @@ -0,0 +1,98 @@ +/* + * strchr test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + char *(*fun)(const char *s, int c); +} funtab[] = { +#define F(x) {#x, x}, +F(strchr) +#if __aarch64__ +F(__strchr_aarch64) +# if __ARM_FEATURE_SVE +F(__strchr_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup(sbuf); + char *s = src + align; + char *f = seekpos != -1 ? 
s + seekpos : 0; + int seekchar = 0x1; + void *p; + + if (len > LEN || seekpos >= len - 1 || align >= A) + abort(); + if (seekchar >= 'a' && seekchar <= 'a' + 23) + abort(); + + for (int i = 0; i < len + A; i++) + src[i] = '?'; + for (int i = 0; i < len - 2; i++) + s[i] = 'a' + i%23; + if (seekpos != -1) + s[seekpos] = seekchar; + s[len - 1] = '\0'; + + p = fun->fun(s, seekchar); + + if (p != f) { + ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); + ERR("expected: %p\n", f); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + int n; + for (n = 1; n < 100; n++) { + for (int sp = 0; sp < n - 1; sp++) + test(funtab+i, a, sp, n); + test(funtab+i, a, -1, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, a, -1, n); + test(funtab+i, a, n / 2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c new file mode 100644 index 0000000..c4260e6 --- /dev/null +++ b/string/test/strchrnul.c @@ -0,0 +1,100 @@ +/* + * strchrnul test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + char *(*fun)(const char *s, int c); +} funtab[] = { +#define F(x) {#x, x}, +F(strchrnul) +#if __aarch64__ +F(__strchrnul_aarch64) +# if __ARM_FEATURE_SVE +F(__strchrnul_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup(sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : s + len - 1; + int seekchar = 0x1; + void *p; + + if (len > LEN || seekpos >= len - 1 || align >= A) + abort(); + if (seekchar >= 'a' && seekchar <= 'a' + 23) + abort(); + + for (int i = 0; i < len + A; i++) + src[i] = '?'; + for (int i = 0; i < len - 2; i++) + s[i] = 'a' + i%23; + if (seekpos != -1) + s[seekpos] = seekchar; + s[len - 1] = '\0'; + + p = fun->fun(s, seekchar); + + if (p != f) { + ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); + ERR("expected: %p\n", f); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + int n; + for (n = 1; n < 100; n++) { + for (int sp = 0; sp < n - 1; sp++) + test(funtab+i, a, sp, n); + test(funtab+i, a, -1, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, a, -1, n); + test(funtab+i, a, n / 2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strcmp.c b/string/test/strcmp.c new file mode 100644 index 0000000..c4e8867 --- /dev/null +++ b/string/test/strcmp.c @@ -0,0 +1,104 @@ +/* + * strcmp test. + * + * Copyright (c) 2019, Arm Limited. 
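+ *
+ * Note: in this driver, diffpos values 0 and 1 act as "no difference"
+ * sentinels: s1[diffpos] is only bumped when diffpos > 1, so the
+ * expected result is r == 0 exactly when diffpos <= 1 and r != 0
+ * otherwise.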
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + int (*fun)(const char *s1, const char *s2); +} funtab[] = { +#define F(x) {#x, x}, +F(strcmp) +#if __aarch64__ +F(__strcmp_aarch64) +# if __ARM_FEATURE_SVE +F(__strcmp_aarch64_sve) +# endif +#elif __arm__ +# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 +F(__strcmp_arm) +# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 +F(__strcmp_armv6m) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static char s1buf[LEN+2*A]; +static char s2buf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos) +{ + char *src1 = alignup(s1buf); + char *src2 = alignup(s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; + + if (len > LEN || s1align >= A || s2align >= A) + abort(); + if (diffpos > 1 && diffpos >= len-1) + abort(); + + for (int i = 0; i < len+A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len-1; i++) + s1[i] = s2[i] = 'a' + i%23; + if (diffpos > 1) + s1[diffpos]++; + s1[len] = s2[len] = '\0'; + + r = fun->fun(s1, s2); + + if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) { + ERR("%s(align %d, align %d, %d) failed, returned %d\n", + fun->name, s1align, s2align, len, r); + ERR("src1: %.*s\n", s1align+len+1, src1); + ERR("src2: %.*s\n", s2align+len+1, src2); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) { + test(funtab+i, d, s, n, 0); + test(funtab+i, d, s, n, n / 2); + } + for (; n < LEN; n *= 2) { + test(funtab+i, d, s, n, 0); + test(funtab+i, d, s, n, n / 2); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strcpy.c b/string/test/strcpy.c new file mode 100644 index 0000000..3072ade --- /dev/null +++ b/string/test/strcpy.c @@ -0,0 +1,100 @@ +/* + * strcpy test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + char *(*fun)(char *dest, const char *src); +} funtab[] = { +#define F(x) {#x, x}, +F(strcpy) +#if __aarch64__ +F(__strcpy_aarch64) +# if __ARM_FEATURE_SVE +F(__strcpy_aarch64_sve) +# endif +#elif __arm__ && defined (__thumb2__) && !defined (__thumb__) +F(__strcpy_arm) +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static char dbuf[LEN+2*A]; +static char sbuf[LEN+2*A]; +static char wbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int dalign, int salign, int len) +{ + char *src = alignup(sbuf); + char *dst = alignup(dbuf); + char *want = wbuf; + char *s = src + salign; + char *d = dst + dalign; + char *w = want + dalign; + void *p; + int i; + + if (len > LEN || dalign >= A || salign >= A) + abort(); + for (i = 0; i < len+A; i++) { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len-1; i++) + s[i] = w[i] = 'a' + i%23; + s[i] = w[i] = '\0'; + + p = fun->fun(d, s); + if (p != d) + ERR("%s(%p,..) returned %p\n", fun->name, d, p); + for (i = 0; i < len+A; i++) { + if (dst[i] != want[i]) { + ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); + ERR("got : %.*s\n", dalign+len+1, dst); + ERR("want: %.*s\n", dalign+len+1, want); + break; + } + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) + test(funtab+i, d, s, n); + for (; n < LEN; n *= 2) + test(funtab+i, d, s, n); + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strlen.c b/string/test/strlen.c new file mode 100644 index 0000000..700c865 --- /dev/null +++ b/string/test/strlen.c @@ -0,0 +1,91 @@ +/* + * strlen test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + size_t (*fun)(const char *s); +} funtab[] = { +#define F(x) {#x, x}, +F(strlen) +#if __aarch64__ +F(__strlen_aarch64) +# if __ARM_FEATURE_SVE +F(__strlen_aarch64_sve) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 +F(__strlen_armv6t2) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int len) +{ + char *src = alignup(sbuf); + char *s = src + align; + size_t r; + + if (len > LEN || align >= A) + abort(); + + for (int i = 0; i < len + A; i++) + src[i] = '?'; + for (int i = 0; i < len - 2; i++) + s[i] = 'a' + i%23; + s[len - 1] = '\0'; + + r = fun->fun(s); + if (r != len-1) { + ERR("%s(%p) returned %zu\n", fun->name, s, r); + ERR("input: %.*s\n", align+len+1, src); + ERR("expected: %d\n", len); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + int n; + for (n = 1; n < 100; n++) + test(funtab+i, a, n); + for (; n < LEN; n *= 2) + test(funtab+i, a, n); + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strncmp.c b/string/test/strncmp.c new file mode 100644 index 0000000..14e0a8c --- /dev/null +++ b/string/test/strncmp.c @@ -0,0 +1,104 @@ +/* + * strncmp test. + * + * Copyright (c) 2019, Arm Limited. 
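+ *
+ * Note: strncmp inspects at most maxlen bytes, so a difference planted
+ * at or beyond maxlen must still compare equal.  The driver models this
+ * with the clamp "diffpos = maxlen <= diffpos ? 0 : diffpos;" before
+ * checking the return value.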
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + int (*fun)(const char *, const char *, size_t); +} funtab[] = { +#define F(x) {#x, x}, +F(strncmp) +#if __aarch64__ +F(__strncmp_aarch64) +# if __ARM_FEATURE_SVE +F(__strncmp_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define LEN 250000 +static char s1buf[LEN+2*A]; +static char s2buf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len) +{ + char *src1 = alignup(s1buf); + char *src2 = alignup(s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; + + if (len > LEN || s1align >= A || s2align >= A) + abort(); + if (diffpos > 1 && diffpos >= len-1) + abort(); + + for (int i = 0; i < len+A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len-1; i++) + s1[i] = s2[i] = 'a' + i%23; + if (diffpos > 1) + s1[diffpos]++; + s1[len] = s2[len] = '\0'; + + r = fun->fun(s1, s2, maxlen); + + diffpos = maxlen <= diffpos ? 0 : diffpos; + + if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) { + ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n", + fun->name, s1align, s2align, maxlen, len, r, diffpos); + ERR("src1: %.*s\n", s1align+len+1, src1); + ERR("src2: %.*s\n", s2align+len+1, src2); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) { + int n; + for (n = 0; n < 100; n++) { + test(funtab+i, d, s, n, 0, n); + test(funtab+i, d, s, n, n/2, n); + test(funtab+i, d, s, n/2, 0, n); + test(funtab+i, d, s, n/2, n/2, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, d, s, n, 0, n); + test(funtab+i, d, s, n, n/2, n); + test(funtab+i, d, s, n/2, 0, n); + test(funtab+i, d, s, n/2, n/2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strnlen.c b/string/test/strnlen.c new file mode 100644 index 0000000..9a98d80 --- /dev/null +++ b/string/test/strnlen.c @@ -0,0 +1,94 @@ +/* + * strnlen test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define _POSIX_C_SOURCE 200809L + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + size_t (*fun)(const char *s, size_t m); +} funtab[] = { +#define F(x) {#x, x}, +F(strnlen) +#if __aarch64__ +F(__strnlen_aarch64) +# if __ARM_FEATURE_SVE +F(__strnlen_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int maxlen, int len) +{ + char *src = alignup(sbuf); + char *s = src + align; + size_t r; + size_t e = maxlen < len ? 
maxlen : len - 1; + + if (len > LEN || align >= A) + abort(); + + for (int i = 0; i < len + A; i++) + src[i] = '?'; + for (int i = 0; i < len - 2; i++) + s[i] = 'a' + i%23; + s[len - 1] = '\0'; + + r = fun->fun(s, maxlen); + if (r != e) { + ERR("%s(%p) returned %zu\n", fun->name, s, r); + ERR("input: %.*s\n", align+len+1, src); + ERR("expected: %d\n", len); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + int n; + for (n = 1; n < 100; n++) + for (int maxlen = 0; maxlen < 100; maxlen++) + test(funtab+i, a, maxlen, n); + for (; n < LEN; n *= 2) { + test(funtab+i, a, n*2, n); + test(funtab+i, a, n, n); + test(funtab+i, a, n/2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} diff --git a/string/test/strrchr.c b/string/test/strrchr.c new file mode 100644 index 0000000..b3fc2a9 --- /dev/null +++ b/string/test/strrchr.c @@ -0,0 +1,97 @@ +/* + * strrchr test. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "stringlib.h" + +static const struct fun +{ + const char *name; + char *(*fun)(const char *s, int c); +} funtab[] = { +#define F(x) {#x, x}, +F(strrchr) +#if __aarch64__ +# if __ARM_FEATURE_SVE +F(__strrchr_aarch64_sve) +# endif +#endif +#undef F + {0, 0} +}; + +static int test_status; +#define ERR(...) (test_status=1, printf(__VA_ARGS__)) + +#define A 32 +#define SP 512 +#define LEN 250000 +static char sbuf[LEN+2*A]; + +static void *alignup(void *p) +{ + return (void*)(((uintptr_t)p + A-1) & -A); +} + +static void test(const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup(sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : 0; + int seekchar = 0x1; + void *p; + + if (len > LEN || seekpos >= len - 1 || align >= A) + abort(); + if (seekchar >= 'a' && seekchar <= 'a' + 23) + abort(); + + for (int i = 0; i < len + A; i++) + src[i] = '?'; + for (int i = 0; i < len - 2; i++) + s[i] = 'a' + i%23; + if (seekpos != -1) + s[seekpos/2] = s[seekpos] = seekchar; + s[len - 1] = '\0'; + + p = fun->fun(s, seekchar); + + if (p != f) { + ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); + ERR("expected: %p\n", f); + abort(); + } +} + +int main() +{ + int r = 0; + for (int i=0; funtab[i].name; i++) { + test_status = 0; + for (int a = 0; a < A; a++) { + int n; + for (n = 1; n < 100; n++) { + for (int sp = 0; sp < n - 1; sp++) + test(funtab+i, a, sp, n); + test(funtab+i, a, -1, n); + } + for (; n < LEN; n *= 2) { + test(funtab+i, a, -1, n); + test(funtab+i, a, n / 2, n); + } + } + if (test_status) { + r = -1; + ERR("FAIL %s\n", funtab[i].name); + } + } + return r; +} |