author     Treehugger Robot <treehugger-gerrit@google.com>           2019-09-18 11:57:37 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>  2019-09-18 11:57:37 +0000
commit     6a751e14fcef594edf6a0eb7fab98799deaa484e (patch)
tree       d58d9102b2010c48c3d95474fb5914ae4c2371d9
parent     055b2f3c3849d992f9287a2fa0dbe79bd9f2f251 (diff)
parent     68a0658ee72ca4c3961719900e9f97562dd66931 (diff)
Merge "Upgrade arm-optimized-routines to 9c8399909a9835e6f55977df1661cf6306c56707"ndk-sysroot-r21
 METADATA                       |   6
 math/exp.c                     |   4
 math/exp2.c                    |   4
 math/include/mathlib.h         |   3
 math/log.c                     |   4
 math/log2.c                    |   4
 math/pow.c                     |   4
 math/test/ulp.c                |   2
 string/Dir.mk                  |  24
 string/aarch64/memchr-sve.S    |  62
 string/aarch64/memchr.S        | 149
 string/aarch64/memcmp-sve.S    |  48
 string/aarch64/memcmp.S        | 141
 string/aarch64/memcpy.S        | 178
 string/aarch64/memmove.S       | 103
 string/aarch64/memset.S        | 188
 string/aarch64/strchr-sve.S    |  69
 string/aarch64/strchr.S        | 137
 string/aarch64/strchrnul-sve.S |   9
 string/aarch64/strchrnul.S     | 122
 string/aarch64/strcmp-sve.S    |  57
 string/aarch64/strcmp.S        | 177
 string/aarch64/strcpy-sve.S    |  69
 string/aarch64/strcpy.S        | 314
 string/aarch64/strlen-sve.S    |  55
 string/aarch64/strlen.S        | 214
 string/aarch64/strncmp-sve.S   |  66
 string/aarch64/strncmp.S       | 266
 string/aarch64/strnlen-sve.S   |  72
 string/aarch64/strnlen.S       | 160
 string/aarch64/strrchr-sve.S   |  83
 string/arm/memchr.S            | 133
 string/arm/memcpy.S            | 593
 string/arm/memset.S            |  99
 string/arm/strcmp-armv6m.S     | 118
 string/arm/strcmp.S            | 479
 string/arm/strcpy.c            | 129
 string/arm/strlen-armv6t2.S    | 125
 string/include/stringlib.h     |  32
 string/memchr.S                |  15
 string/memcmp.S                |  13
 string/memcpy.S                |  12
 string/memmove.S               |  10
 string/memset.S                |  12
 string/strchr.S                |  13
 string/strchrnul.S             |  13
 string/strcmp.S                |  19
 string/strcpy-c.c              |  10
 string/strcpy.S                |  13
 string/strlen.S                |  17
 string/strncmp.S               |  13
 string/strnlen.S               |  13
 string/strrchr.S               |  12
 string/test/memchr.c           |  94
 string/test/memcmp.c           |  97
 string/test/memcpy.c           |   3
 string/test/memmove.c          | 142
 string/test/memset.c           | 112
 string/test/strchr.c           |  98
 string/test/strchrnul.c        | 100
 string/test/strcmp.c           | 104
 string/test/strcpy.c           | 100
 string/test/strlen.c           |  91
 string/test/strncmp.c          | 104
 string/test/strnlen.c          |  94
 string/test/strrchr.c          |  97
 66 files changed, 5916 insertions(+), 7 deletions(-)
diff --git a/METADATA b/METADATA
index 7c706d5..9762f4f 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "6b594432c8ac46e71686ea21fad30d1c3f79e65a"
+ version: "9c8399909a9835e6f55977df1661cf6306c56707"
license_type: NOTICE
last_upgrade_date {
year: 2019
- month: 7
- day: 31
+ month: 9
+ day: 3
}
}
diff --git a/math/exp.c b/math/exp.c
index ffd3111..1909b8e 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -169,4 +170,7 @@ __exp_dd (double x, double xtail)
strong_alias (exp, __exp_finite)
hidden_alias (exp, __ieee754_exp)
hidden_alias (__exp_dd, __exp1)
+# if LDBL_MANT_DIG == 53
+long double expl (long double x) { return exp (x); }
+# endif
#endif
diff --git a/math/exp2.c b/math/exp2.c
index fbedbcb..47aa479 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -136,4 +137,7 @@ exp2 (double x)
#if USE_GLIBC_ABI
strong_alias (exp2, __exp2_finite)
hidden_alias (exp2, __ieee754_exp2)
+# if LDBL_MANT_DIG == 53
+long double exp2l (long double x) { return exp2 (x); }
+# endif
#endif
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index aac2d4d..eed294b 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -5,9 +5,6 @@
* SPDX-License-Identifier: MIT
*/
-float sinf (float);
-float cosf (float);
-float tanf (float);
float expf (float);
float exp2f (float);
float logf (float);
diff --git a/math/log.c b/math/log.c
index 1283ef2..b85d3ff 100644
--- a/math/log.c
+++ b/math/log.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -155,4 +156,7 @@ log (double x)
#if USE_GLIBC_ABI
strong_alias (log, __log_finite)
hidden_alias (log, __ieee754_log)
+# if LDBL_MANT_DIG == 53
+long double logl (long double x) { return log (x); }
+# endif
#endif
diff --git a/math/log2.c b/math/log2.c
index 478b33d..804fb85 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -134,4 +135,7 @@ log2 (double x)
#if USE_GLIBC_ABI
strong_alias (log2, __log2_finite)
hidden_alias (log2, __ieee754_log2)
+# if LDBL_MANT_DIG == 53
+long double log2l (long double x) { return log2 (x); }
+# endif
#endif
diff --git a/math/pow.c b/math/pow.c
index e55f159..493488d 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -371,4 +372,7 @@ pow (double x, double y)
#if USE_GLIBC_ABI
strong_alias (pow, __pow_finite)
hidden_alias (pow, __ieee754_pow)
+# if LDBL_MANT_DIG == 53
+long double powl (long double x, long double y) { return pow (x, y); }
+# endif
#endif
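Note on the pattern above: the same three-line guard is added to exp.c, exp2.c, log.c, log2.c and pow.c. When long double has the IEEE binary64 format (LDBL_MANT_DIG == 53), the long double entry points can simply forward to the double implementations without losing precision. A minimal consolidated C sketch of the idea (illustration only, not part of the commit; the real code additionally sits behind USE_GLIBC_ABI as shown in the hunks):

    #include <float.h>
    #include <math.h>

    #if LDBL_MANT_DIG == 53
    /* long double and double share the 53-bit binary64 format here,
       so these wrappers are exact. */
    long double expl (long double x) { return exp (x); }
    long double logl (long double x) { return log (x); }
    long double powl (long double x, long double y) { return pow (x, y); }
    #endif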
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 8de6e5b..8782fb0 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -704,7 +704,7 @@ main (int argc, char *argv[])
if (!USE_MPFR && conf.mpfr)
{
puts ("mpfr is not available.");
- return 1;
+ return 0;
}
argc--;
argv++;
diff --git a/string/Dir.mk b/string/Dir.mk
index e179642..bd9979f 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -14,6 +14,18 @@ string-libs := \
string-tools := \
build/bin/test/memcpy \
+ build/bin/test/memmove \
+ build/bin/test/memset \
+ build/bin/test/memchr \
+ build/bin/test/memcmp \
+ build/bin/test/strcpy \
+ build/bin/test/strcmp \
+ build/bin/test/strchr \
+ build/bin/test/strrchr \
+ build/bin/test/strchrnul \
+ build/bin/test/strlen \
+ build/bin/test/strnlen \
+ build/bin/test/strncmp
string-lib-base := $(basename $(string-lib-srcs))
string-lib-objs := $(string-lib-base:$(srcdir)/%=build/%.o)
@@ -47,5 +59,17 @@ build/bin/%.sh: $(srcdir)/string/test/%.sh
check-string: $(string-tools)
$(EMULATOR) build/bin/test/memcpy
+ $(EMULATOR) build/bin/test/memmove
+ $(EMULATOR) build/bin/test/memset
+ $(EMULATOR) build/bin/test/memchr
+ $(EMULATOR) build/bin/test/memcmp
+ $(EMULATOR) build/bin/test/strcpy
+ $(EMULATOR) build/bin/test/strcmp
+ $(EMULATOR) build/bin/test/strchr
+ $(EMULATOR) build/bin/test/strrchr
+ $(EMULATOR) build/bin/test/strchrnul
+ $(EMULATOR) build/bin/test/strlen
+ $(EMULATOR) build/bin/test/strnlen
+ $(EMULATOR) build/bin/test/strncmp
.PHONY: all-string check-string
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
new file mode 100644
index 0000000..0d75acd
--- /dev/null
+++ b/string/aarch64/memchr-sve.S
@@ -0,0 +1,62 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __memchr_aarch64_sve
+ .type __memchr_aarch64_sve, %function
+ .p2align 4
+__memchr_aarch64_sve:
+ dup z1.b, w1 /* duplicate c to a vector */
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+ nop
+
+0: whilelo p1.b, x3, x2 /* make sure off < max */
+ b.none 9f
+
+ /* Read a vector's worth of bytes, bounded by max,
+ stopping on first fault. */
+ ldff1b z0.b, p1/z, [x0, x3]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector bounded by max is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x3 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ b.none 0b
+ decb x3 /* undo speculate */
+
+ /* Found C. */
+1: brkb p2.b, p1/z, p2.b /* find the first c */
+ add x0, x0, x3 /* form partial pointer */
+ incp x0, p2.b /* form final pointer to c */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b
+ b.any 1b
+
+ /* No C found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x3, p0.b
+ b 0b
+
+ /* Found end of count. */
+9: mov x0, 0 /* return null */
+ ret
+
+ .size __memchr_aarch64_sve, . - __memchr_aarch64_sve
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
new file mode 100644
index 0000000..e5a3abf
--- /dev/null
+++ b/string/aarch64/memchr.S
@@ -0,0 +1,149 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows to
+ * identify exactly which byte has matched.
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memchr_aarch64
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, .Lzero_length
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq .Lloop
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls .Lmasklast
+ /* Have we found something already? */
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls .Lend
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, .Lloop
+
+.Lend:
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi .Ltail
+
+.Lmasklast:
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret
+
+ .size __memchr_aarch64, . - __memchr_aarch64
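The comment block at the top of memchr.S above describes the syndrome scheme: for each 32-byte chunk a 64-bit syndrome is built with two bits per byte, bit 0 set on a match, so counting the trailing zeros of the syndrome and halving gives the index of the first matching byte. A minimal C model of that scheme (illustration only; the function name and the use of the GCC/Clang builtin __builtin_ctzll are assumptions, not part of the commit):

    #include <stddef.h>
    #include <stdint.h>

    static const void *memchr_syndrome_model (const void *s, int c, size_t n)
    {
        const unsigned char *p = s;
        for (size_t base = 0; base < n; base += 32)
        {
            size_t chunk = (n - base < 32) ? n - base : 32;
            uint64_t synd = 0;
            for (size_t i = 0; i < chunk; i++)
                if (p[base + i] == (unsigned char) c)
                    synd |= 1ull << (2 * i);   /* bit 0 of the byte's 2-bit tuple */
            if (synd != 0)
                /* Trailing-zero count is twice the byte index of the first match. */
                return p + base + (__builtin_ctzll (synd) >> 1);
        }
        return NULL;
    }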
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
new file mode 100644
index 0000000..d4f6026
--- /dev/null
+++ b/string/aarch64/memcmp-sve.S
@@ -0,0 +1,48 @@
+/*
+ * memcmp - compare memory
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __memcmp_aarch64_sve
+ .type __memcmp_aarch64_sve, %function
+ .p2align 4
+__memcmp_aarch64_sve:
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */
+ ld1b z1.b, p0/z, [x1, x3]
+
+ /* Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+
+ cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */
+ b.none 0b
+
+ /* Found inequality. */
+1: brkb p1.b, p0/z, p1.b /* find first such */
+ lasta w0, p1, z0.b /* extract each byte */
+ lasta w1, p1, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equality */
+ ret
+
+ .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
new file mode 100644
index 0000000..72a66bc
--- /dev/null
+++ b/string/aarch64/memcmp.S
@@ -0,0 +1,141 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(l) .L ## l
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memcmp_aarch64 p2align=6
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
+ .size __memcmp_aarch64, . - __memcmp_aarch64
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
new file mode 100644
index 0000000..4bbd288
--- /dev/null
+++ b/string/aarch64/memcpy.S
@@ -0,0 +1,178 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+ Small and medium copies read all data before writing, allowing any
+ kind of overlap, and memmove tailcalls memcpy for these cases as
+ well as non-overlapping copies.
+*/
+
+def_fn __memcpy_aarch64 p2align=6
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 2f
+1:
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .size __memcpy_aarch64, . - __memcpy_aarch64
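The "Copy 0..3 bytes" sequence in memcpy.S above avoids length branches by loading the first byte, the last byte and the byte at count/2, then storing all three; for count==1 the three loads hit the same byte and for count==2 the middle load duplicates one of the others. A C model of that trick (illustration only; the helper name is an assumption):

    #include <stddef.h>

    static void copy_0_3_model (unsigned char *dst, const unsigned char *src,
                                size_t count)
    {
        if (count == 0)
            return;
        size_t mid = count >> 1;      /* 0 when count==1, overlaps a neighbour otherwise */
        unsigned char first  = src[0];
        unsigned char middle = src[mid];
        unsigned char last   = src[count - 1];
        dst[0]         = first;
        dst[mid]       = middle;
        dst[count - 1] = last;
    }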
diff --git a/string/aarch64/memmove.S b/string/aarch64/memmove.S
new file mode 100644
index 0000000..5e70f21
--- /dev/null
+++ b/string/aarch64/memmove.S
@@ -0,0 +1,103 @@
+/*
+ * memmove - copy memory area
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define dstin x0
+#define src x1
+#define count x2
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+ Larger backwards copies are also handled by memcpy. The only remaining
+ case is forward large copies. The destination is aligned, and an
+ unrolled loop processes 64 bytes per iteration.
+*/
+
+def_fn __memmove_aarch64, 6
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs __memcpy_aarch64
+
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+ .size __memmove_aarch64, . - __memmove_aarch64
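The entry sequence of memmove.S above (sub/cmp/ccmp/b.hs) implements the dispatch described in its header comment: copies of at most 96 bytes, and copies where dst - src taken as an unsigned value is at least count, are forwarded to __memcpy_aarch64; only large forward-overlapping copies fall through to the backward loop. A C model of that predicate (illustration only; it relies on the custom memcpy's "read everything before writing" property for small sizes, which the standard memcpy does not promise, and the fallback here is a plain byte loop rather than the real 64-byte unrolled loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void *memmove_dispatch_model (void *dst, const void *src, size_t count)
    {
        uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;  /* wraps huge if dst < src */
        if (count <= 96 || diff >= count)
            return memcpy (dst, src, count);   /* overlap-safe for a forward copy */

        /* Large forward-overlapping copy: go backwards. */
        unsigned char *d = dst;
        const unsigned char *s = src;
        for (size_t i = count; i-- > 0;)
            d[i] = s[i];
        return dst;
    }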
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
new file mode 100644
index 0000000..aef22e9
--- /dev/null
+++ b/string/aarch64/memset.S
@@ -0,0 +1,188 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memset_aarch64 p2align=6
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
+ .size __memset_aarch64, . - __memset_aarch64
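L(try_zva) in memset.S above reads DCZID_EL0 to decide whether DC ZVA may be used and how large its zero block is: bit 4 set means the instruction is prohibited, and the low four bits hold log2 of the block size in 4-byte words, which is why the code computes 4 << (dczid & 15). A small AArch64-only C sketch of that decode (illustration only; the function names are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    static inline uint64_t read_dczid_el0 (void)
    {
        uint64_t v;
        __asm__ volatile ("mrs %0, dczid_el0" : "=r" (v));  /* readable at EL0 */
        return v;
    }

    static inline bool dc_zva_permitted (uint64_t dczid)
    {
        return (dczid & (1u << 4)) == 0;   /* DZP bit set means DC ZVA prohibited */
    }

    static inline unsigned dc_zva_block_bytes (uint64_t dczid)
    {
        return 4u << (dczid & 15);         /* BS field: log2 of block size in words */
    }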
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
new file mode 100644
index 0000000..8d8a319
--- /dev/null
+++ b/string/aarch64/strchr-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strchr/strchrnul - find a character in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
+#ifdef BUILD_STRCHRNUL
+#define FUNC __strchrnul_aarch64_sve
+#else
+#define FUNC __strchr_aarch64_sve
+#endif
+
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */
+ b.none 0b
+ decb x0 /* undo speculate */
+
+ /* Found C or 0. */
+1: brka p4.b, p1/z, p4.b /* find first such */
+ sub x0, x0, 1 /* adjust pointer for that byte */
+ incp x0, p4.b
+#ifndef BUILD_STRCHRNUL
+ ptest p4, p2.b /* was first in c? */
+ csel x0, xzr, x0, none /* if there was no c, return null */
+#endif
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */
+ b.any 1b
+
+ /* No C or 0 found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x0, p0.b
+ b 0b
+
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
new file mode 100644
index 0000000..945be3d
--- /dev/null
+++ b/string/aarch64/strchr.S
@@ -0,0 +1,137 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __strchr_aarch64
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vend1.16b, vend2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+ /* Tmp1 is even if the target character was found first. Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+ .size __strchr_aarch64, . - __strchr_aarch64
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
new file mode 100644
index 0000000..5140e59
--- /dev/null
+++ b/string/aarch64/strchrnul-sve.S
@@ -0,0 +1,9 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STRCHRNUL
+#include "strchr-sve.S"
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
new file mode 100644
index 0000000..d19c0e8
--- /dev/null
+++ b/string/aarch64/strchrnul.S
@@ -0,0 +1,122 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __strchrnul_aarch64
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+ orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+ .size __strchrnul_aarch64, . - __strchrnul_aarch64
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
new file mode 100644
index 0000000..91bac19
--- /dev/null
+++ b/string/aarch64/strcmp-sve.S
@@ -0,0 +1,57 @@
+/*
+ * __strcmp_aarch64_sve - compare two strings
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strcmp_aarch64_sve
+ .type __strcmp_aarch64_sve, %function
+ .p2align 4
+__strcmp_aarch64_sve:
+ setffr /* initialize FFR */
+ ptrue p1.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+ nop
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, x2]
+ ldff1b z1.b, p1/z, [x1, x2]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x2, all /* skip bytes for next round */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */
+ cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p1/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: incp x2, p0.b /* skip bytes for next round */
+ setffr /* re-init FFR for next round */
+ cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p0/z, z0.b, 0
+ nands p2.b, p0/z, p2.b, p3.b
+ b.none 0b
+ b 1b
+
+ .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
new file mode 100644
index 0000000..2aa367c
--- /dev/null
+++ b/string/aarch64/strcmp.S
@@ -0,0 +1,177 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+def_fn __strcmp_aarch64 p2align=6
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+ .size __strcmp_aarch64, .-__strcmp_aarch64
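The zero-detection idiom used in strcmp.S above (and again in strcpy.S and strlen.S below) is the comment's (X - 1) & ~X & 0x80 trick applied to all eight bytes at once: subtracting REP8_01 only borrows out of a byte that is zero (or that itself received a borrow), and masking with ~X & REP8_80 discards bytes whose own top bit was set, so the result is non-zero iff the word contains a zero byte, and its lowest set bit marks the first one. A little-endian C model (illustration only; the names and the __builtin_ctzll call are assumptions):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_80 0x8080808080808080ull

    static inline uint64_t has_nul_byte (uint64_t x)
    {
        /* Non-zero iff some byte of x is zero.  Bits above the first zero
           byte may be polluted by borrow propagation (the big-endian path
           in strcmp.S recomputes the syndrome on byte-reversed data for
           exactly this reason), but the lowest set bit is always exact. */
        return (x - REP8_01) & ~x & REP8_80;
    }

    static inline int first_nul_index (uint64_t x)
    {
        /* Byte index of the first zero byte for a little-endian load,
           assuming has_nul_byte (x) != 0. */
        return __builtin_ctzll (has_nul_byte (x)) >> 3;
    }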
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
new file mode 100644
index 0000000..c929f37
--- /dev/null
+++ b/string/aarch64/strcpy-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
+#ifdef BUILD_STPCPY
+#define FUNC __stpcpy_aarch64_sve
+#else
+#define FUNC __strcpy_aarch64_sve
+#endif
+
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
+ setffr /* initialize FFR */
+ ptrue p2.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x1, x2]
+ rdffrs p0.b, p2/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the whole vector and loop. */
+ st1b z0.b, p2, [x0, x2]
+ incb x2, all
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the valid portion of the vector and loop. */
+ setffr /* re-init FFR */
+ st1b z0.b, p0, [x0, x2]
+ incp x2, p0.b
+ b 0b
+
+ /* Zero found. Crop the vector to the found zero and finish. */
+2: brka p0.b, p2/z, p1.b
+ st1b z0.b, p0, [x0, x2]
+#ifdef BUILD_STPCPY
+ add x0, x0, x2
+ sub x0, x0, 1
+ incp x0, p0.b
+#endif
+ ret
+
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
new file mode 100644
index 0000000..4e10b4d
--- /dev/null
+++ b/string/aarch64/strcpy.S
@@ -0,0 +1,314 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2013-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY __stpcpy_aarch64
+#else
+#define STRCPY __strcpy_aarch64
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+def_fn STRCPY p2align=6
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt .Lpage_cross
+
+.Lpage_cross_ok:
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq .Lbulk_entry
+
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+.Lfp_gt8:
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+.Lfp_le8:
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt .Lfp_lt4
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+.Lfp_lt4:
+ cbz pos, .Lfp_lt2
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+.Lfp_lt2:
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+ /* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+.Lbulk_entry:
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b .Lentry_no_page_cross
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+.Lmain_loop:
+ stp data1, data2, [dst], #16
+.Lentry_no_page_cross:
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lmain_loop
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+.Lpage_cross:
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lpage_cross_ok
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, .Lfp_le8
+ bic has_nul2, tmp3, tmp4
+ b .Lfp_gt8
+
+ .size STRCPY, . - STRCPY
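
Editorial note, not part of the patch: the tail handling above follows the comment in the code: once at least 16 bytes have been copied, the cheapest way to finish is to locate the trailing NUL and re-copy the 16 bytes that end at it, overlapping bytes already written. A minimal C sketch of that idea (the function name and parameters are illustrative only):

#include <stddef.h>
#include <string.h>

/* nul_off is the offset of the terminating NUL in src; assumes
   nul_off >= 15 and that the first bytes were already copied. */
static void
copy_tail_sketch (char *dst, const char *src, size_t nul_off)
{
  /* Re-copy the 16 bytes ending at the NUL, including the NUL itself. */
  memcpy (dst + nul_off - 15, src + nul_off - 15, 16);
}
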
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
new file mode 100644
index 0000000..64ede85
--- /dev/null
+++ b/string/aarch64/strlen-sve.S
@@ -0,0 +1,55 @@
+/*
+ * __strlen_aarch64_sve - compute the length of a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strlen_aarch64_sve
+ .type __strlen_aarch64_sve, %function
+ .p2align 4
+__strlen_aarch64_sve:
+ setffr /* initialize FFR */
+ ptrue p2.b /* all ones; loop invariant */
+ mov x1, 0 /* initialize length */
+ nop
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x0, x1]
+ nop
+ rdffrs p0.b, p2/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x1, all /* speculate increment */
+ cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */
+ b.none 0b
+ decb x1, all /* undo speculate */
+
+ /* Zero found. Select the bytes before the first and count them. */
+1: brkb p0.b, p2/z, p1.b
+ incp x1, p0.b
+ mov x0, x1
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p1.b, p0/z, z0.b, 0
+ b.any 1b
+
+ /* No zero found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x1, p0.b
+ b 0b
+
+ .size __strlen_aarch64_sve, . - __strlen_aarch64_sve
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
new file mode 100644
index 0000000..26388d7
--- /dev/null
+++ b/string/aarch64/strlen.S
@@ -0,0 +1,214 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+
+/* Locals and temporaries. */
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ __strlen_aarch64 will be repeatedly called on strings with the same length).
+
+   If the string is longer than 16 bytes, we align src so we don't need
+   further page cross checks, and process 32 bytes per iteration
+   using the fast NUL check.  If we encounter non-ASCII characters,
+   we fall back to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
+def_fn __strlen_aarch64 p2align=6
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+.Lpage_cross_entry:
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
+#endif
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
+
+ .size __strlen_aarch64, . - __strlen_aarch64
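
Editorial note, not part of the patch: the NUL-detection principle described in the strlen.S comments above is easy to check in C. A sketch (the function name is illustrative):

#include <stdint.h>

/* (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff some byte of x is
   zero; ~(x | 0x7f..7f) equals ~x & 0x80..80, so this is the classic
   (X - 1) & ~X & 0x80 test applied to all eight bytes at once. */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  const uint64_t rep8_01 = 0x0101010101010101ULL;
  const uint64_t rep8_7f = 0x7f7f7f7f7f7f7f7fULL;
  return (x - rep8_01) & ~(x | rep8_7f);
}
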
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
new file mode 100644
index 0000000..6f31eca
--- /dev/null
+++ b/string/aarch64/strncmp-sve.S
@@ -0,0 +1,66 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strncmp_aarch64_sve
+ .type __strncmp_aarch64_sve, %function
+ .p2align 4
+__strncmp_aarch64_sve:
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x3]
+ ldff1b z1.b, p0/z, [x1, x3]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+ cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */
+ cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p0/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p1/z, z0.b, 0
+ nands p2.b, p1/z, p2.b, p3.b
+ b.any 1b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x3, p1.b
+ b 0b
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equal */
+ ret
+
+ .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
new file mode 100644
index 0000000..ced72b9
--- /dev/null
+++ b/string/aarch64/strncmp.S
@@ -0,0 +1,266 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+#define count mask
+
+ .text
+ .p2align 6
+ .rep 7
+ nop /* Pad so that the loop below fits a cache line. */
+ .endr
+def_fn __strncmp_aarch64
+ cbz limit, .Lret0
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne .Lmisaligned8
+ cbnz count, .Lmutual_align
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* Start of performance-critical section -- one 64B cache line. */
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq .Lloop_aligned
+ /* End of performance-critical section -- one 64B cache line. */
+
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, .Lnot_limit
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq .Lnot_limit
+
+ lsl limit, limit, #3 /* Bits -> bytes. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+.Lnot_limit:
+ orr syndrome, diff, has_nul
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+.Lmutual_align:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
+ b .Lstart_realigned
+
+ .p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
+.Lmisaligned8:
+ cmp limit, #16
+ b.hs .Ltry_misaligned_words
+
+.Lbyte_loop:
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq .Lbyte_loop
+.Ldone:
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+.Ltry_misaligned_words:
+ lsr limit_wd, limit, #3
+ cbz count, .Ldo_misaligned
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+.Lpage_end_loop:
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne .Ldone
+ subs count, count, #1
+ b.hi .Lpage_end_loop
+
+.Ldo_misaligned:
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo .Ldone_loop
+.Lloop_misaligned:
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, .Lpage_end_loop
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+ subs limit_wd, limit_wd, #1
+ b.pl .Lloop_misaligned
+
+.Ldone_loop:
+	/* Limit reached in the dword loop with no difference or NUL found.  */
+ and limit, limit, #7
+ cbz limit, .Lnot_limit
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+
+.Lret0:
+ mov result, #0
+ ret
+ .size __strncmp_aarch64, . - __strncmp_aarch64
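
Editorial note, not part of the patch: the little-endian epilogue above turns the combined syndrome (diff | has_nul) into a return value by byte-reversing, counting leading zeros, shifting, and subtracting the top bytes. A rough C equivalent, with GCC/Clang builtins standing in for the REV and CLZ instructions (names are illustrative):

#include <stdint.h>

/* Assumes syndrome != 0 and little-endian data words. */
static int
result_from_syndrome (uint64_t data1, uint64_t data2, uint64_t syndrome)
{
  int pos = __builtin_clzll (__builtin_bswap64 (syndrome));
  data1 = __builtin_bswap64 (data1) << pos;   /* first difference/NUL now at the top */
  data2 = __builtin_bswap64 (data2) << pos;
  return (int) (data1 >> 56) - (int) (data2 >> 56);
}
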
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
new file mode 100644
index 0000000..3a9be08
--- /dev/null
+++ b/string/aarch64/strnlen-sve.S
@@ -0,0 +1,72 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strnlen_aarch64_sve
+ .type __strnlen_aarch64_sve, %function
+ .p2align 4
+__strnlen_aarch64_sve:
+ setffr /* initialize FFR */
+ mov x2, 0 /* initialize len */
+ b 1f
+
+ .p2align 4
+ /* We have off + vl <= max, and so may read the whole vector. */
+0: ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p2.b, p0/z, z0.b, 0
+ b.any 8f
+ incb x2
+
+1: whilelo p0.b, x2, x1
+ b.last 0b
+
+	/* We have off + vl > max.  Test for off == max before proceeding. */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Compare for end-of-string, but there are no more bytes. */
+ cmpeq p2.b, p0/z, z0.b, 0
+
+ /* Found end-of-string or zero. */
+8: brkb p2.b, p0/z, p2.b
+ mov x0, x2
+ incp x0, p2.b
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, 0
+ b.any 8b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x2, p1.b
+ b 1b
+
+ /* End of count. Return max. */
+9: mov x0, x2
+ ret
+
+ .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
new file mode 100644
index 0000000..b02c846
--- /dev/null
+++ b/string/aarch64/strnlen.S
@@ -0,0 +1,160 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+#define limit x1
+
+/* Locals and temporaries. */
+#define src x2
+#define data1 x3
+#define data2 x4
+#define data2a x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define pos x13
+#define limit_wd x14
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ .text
+ .p2align 6
+.Lstart:
+	/* Pre-pad to ensure the critical loop starts on an icache line boundary. */
+ .rep 7
+ nop
+ .endr
+ /* Put this code here to avoid wasting more space with pre-padding. */
+.Lhit_limit:
+ mov len, limit
+ ret
+
+def_fn __strnlen_aarch64
+ cbz limit, .Lhit_limit
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne .Lmisaligned
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+.Lloop:
+ ldp data1, data2, [src], #16
+.Lrealigned:
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ subs limit_wd, limit_wd, #1
+ orr tmp1, has_nul1, has_nul2
+ ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
+ b.eq .Lloop
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ orr tmp1, has_nul1, has_nul2
+ cbz tmp1, .Lhit_limit /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+ sub len, src, srcin
+ cbz has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
+#endif
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
+ ret
+
+.Lmisaligned:
+ /* Deal with a partial first word.
+	   We're doing two things in parallel here:
+ 1) Calculate the number of words (but avoiding overflow if
+ limit is near ULONG_MAX) - to do this we need to work out
+ limit + tmp1 - 1 as a 65-bit value before shifting it;
+ 2) Load and mask the initial data words - we force the bytes
+ before the ones we are interested in to 0xff - this ensures
+ early bytes will not hit any zero detection. */
+ sub limit_wd, limit, #1
+ neg tmp4, tmp1
+ cmp tmp1, #8
+
+ and tmp3, limit_wd, #15
+ lsr limit_wd, limit_wd, #4
+ mov tmp2, #~0
+
+ ldp data1, data2, [src], #16
+ lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
+ add tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit_wd, tmp3, lsr #4
+
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b .Lrealigned
+ .size __strnlen_aarch64, . - .Lstart /* Include pre-padding in size. */
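
Editorial note, not part of the patch: the .Lmisaligned comment above describes computing limit + tmp1 - 1 as a 65-bit value before shifting, so that a limit near ULONG_MAX cannot overflow. The same split written in C looks roughly like this (names are illustrative):

#include <stdint.h>

/* Returns (limit + misalign - 1) / 16 for limit != 0 and misalign < 16,
   without ever forming the full sum. */
static uint64_t
qwords_to_check (uint64_t limit, uint64_t misalign)
{
  uint64_t limit_wd = (limit - 1) >> 4;
  uint64_t low = ((limit - 1) & 15) + misalign;   /* at most 15 + 15 */
  return limit_wd + (low >> 4);
}
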
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
new file mode 100644
index 0000000..bb522e7
--- /dev/null
+++ b/string/aarch64/strrchr-sve.S
@@ -0,0 +1,83 @@
+/*
+ * strrchr - find the last occurrence of a character in a string
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strrchr_aarch64_sve
+ .type __strrchr_aarch64_sve, %function
+ .p2align 4
+__strrchr_aarch64_sve:
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+ mov x2, 0 /* no match found so far */
+ pfalse p2.b
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0, all /* skip bytes this round */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ b.any 3f
+
+ cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */
+ b.none 0b
+
+ mov x2, x0 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparisons only on the valid bytes.  */
+1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ b.any 2f
+
+ cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */
+ mov x3, x0
+ incp x0, p0.b /* skip bytes this round */
+ setffr /* re-init FFR */
+ b.none 0b
+
+ addvl x2, x3, 1 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* Found end-of-string. */
+2: incb x0, all /* advance base */
+3: brka p3.b, p1/z, p3.b /* mask after first 0 */
+ cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */
+ b.any 4f
+
+ /* No C within last vector. Did we have one before? */
+ cbz x2, 5f
+ mov x0, x2 /* restore advanced base */
+ mov p3.b, p2.b /* restore saved search */
+
+ /* Find the *last* match in the predicate. This is slightly
+ more complicated than finding the first match. */
+4: rev p3.b, p3.b /* reverse the bits */
+ brka p3.b, p1/z, p3.b /* find position of last match */
+ decp x0, p3.b /* retard pointer to last match */
+ ret
+
+ /* No C whatsoever. Return NULL. */
+5: mov x0, 0
+ ret
+
+ .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
new file mode 100644
index 0000000..2eff4d1
--- /dev/null
+++ b/string/arm/memchr.S
@@ -0,0 +1,133 @@
+/*
+ * memchr - scan memory for a character
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This __memchr_arm routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.  It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memchr_arm
+ .type __memchr_arm,%function
+__memchr_arm:
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldmia r0!,{r5,r6}
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+
+ .size __memchr_arm, . - __memchr_arm
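
Editorial note, not part of the patch: the fast path above replicates the match byte across a word, XORs it with the data so that matching bytes become zero, and then detects zero bytes in parallel (the UADD8/SEL pair yields the 00/ff per-byte mask directly). A portable C sketch of the same test, with an illustrative function name:

#include <stdint.h>

/* Returns a word with 0x80 set in each byte position of word that
   equals c; non-zero means at least one match in these four bytes. */
static uint32_t
bytes_matching (uint32_t word, unsigned char c)
{
  uint32_t rep = c * 0x01010101u;   /* replicate c across the word */
  uint32_t x = word ^ rep;          /* matching bytes become zero */
  return (x - 0x01010101u) & ~x & 0x80808080u;
}
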
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
new file mode 100644
index 0000000..3346e4f
--- /dev/null
+++ b/string/arm/memcpy.S
@@ -0,0 +1,593 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memcpy_arm p2align=6
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne .Ltail63unaligned
+ bx lr
+
+ .size __memcpy_arm, . - __memcpy_arm
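
Editorial note, not part of the patch: the bulk-copy loops above are software-pipelined ("SMS style"): each iteration stores the data loaded by the previous one, so loads run well ahead of stores and memory latency is hidden. Reduced to plain 64-bit words, the pattern looks roughly like this (illustrative only; assumes nwords >= 1 and aligned, non-overlapping buffers):

#include <stddef.h>
#include <stdint.h>

static void
copy_pipelined (uint64_t *dst, const uint64_t *src, size_t nwords)
{
  uint64_t cur = *src++;            /* prime the pipeline */
  for (size_t i = 1; i < nwords; i++)
    {
      uint64_t next = *src++;       /* load for the next iteration */
      *dst++ = cur;                 /* store what the previous one loaded */
      cur = next;
    }
  *dst = cur;                       /* drain the pipeline */
}
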
diff --git a/string/arm/memset.S b/string/arm/memset.S
new file mode 100644
index 0000000..3ee5238
--- /dev/null
+++ b/string/arm/memset.S
@@ -0,0 +1,99 @@
+/*
+ * memset - fill memory with a constant
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memset routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.
+
+ */
+
+ .syntax unified
+ .arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@ Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memset_arm
+ .type __memset_arm,%function
+__memset_arm:
+ @ r0 = address
+ @ r1 = character
+ @ r2 = count
+ @ returns original address in r0
+
+ mov r3, r0 @ Leave r0 alone
+ cbz r2, 10f @ Exit if 0 length
+
+ tst r0, #7
+ beq 2f @ Already aligned
+
+ @ Ok, so we're misaligned here
+1:
+ strb r1, [r3], #1
+ subs r2,r2,#1
+ tst r3, #7
+ cbz r2, 10f @ Exit if we hit the end
+ bne 1b @ go round again if still misaligned
+
+2:
+ @ OK, so we're aligned
+ push {r4,r5,r6,r7}
+ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
+ beq 5f
+
+3:
+ @ POSIX says that ch is cast to an unsigned char. A uxtb is one
+ @ byte and takes two cycles, where an AND is four bytes but one
+ @ cycle.
+ and r1, #0xFF
+ orr r1, r1, r1, lsl#8 @ Same character into all bytes
+ orr r1, r1, r1, lsl#16
+ mov r5,r1
+ mov r6,r1
+ mov r7,r1
+
+4:
+ subs r4,r4,#16
+ stmia r3!,{r1,r5,r6,r7}
+ bne 4b
+ and r2,r2,#15
+
+	@ At this point we're still aligned and we have up to align-1 bytes left to write;
+	@ we can avoid some of the byte-at-a-time work now by testing for some big chunks
+ tst r2,#8
+ itt ne
+ subne r2,r2,#8
+ stmiane r3!,{r1,r5}
+
+5:
+ pop {r4,r5,r6,r7}
+ cbz r2, 10f
+
+ @ Got to do any last < alignment bytes
+6:
+ subs r2,r2,#1
+ strb r1,[r3],#1
+ bne 6b
+
+10:
+ bx lr @ goodbye
+ .size __memset_arm, . - __memset_arm
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
new file mode 100644
index 0000000..5ea06c9
--- /dev/null
+++ b/string/arm/strcmp-armv6m.S
@@ -0,0 +1,118 @@
+/*
+ * strcmp for ARMv6-M (optimized for performance, not size)
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+ .thumb_func
+ .syntax unified
+ .arch armv6-m
+
+ .macro DoSub n, label
+ subs r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+ lsrs r1, r4, \n
+#else
+ lsls r1, r4, \n
+#endif
+ orrs r1, r0
+ bne \label
+ .endm
+
+ .macro Byte_Test n, label
+ lsrs r0, r2, \n
+ lsrs r1, r3, \n
+ DoSub \n, \label
+ .endm
+
+ .text
+ .p2align 0
+ .global __strcmp_armv6m
+ .type __strcmp_armv6m, %function
+__strcmp_armv6m:
+ .cfi_startproc
+ mov r2, r0
+ push {r4, r5, r6, lr}
+ orrs r2, r1
+ lsls r2, r2, #30
+ bne 6f
+ ldr r5, =0x01010101
+ lsls r6, r5, #7
+1:
+ ldmia r0!, {r2}
+ ldmia r1!, {r3}
+ subs r4, r2, r5
+ bics r4, r2
+ ands r4, r6
+ beq 3f
+
+#ifdef __ARM_BIG_ENDIAN
+ Byte_Test #24, 4f
+ Byte_Test #16, 4f
+ Byte_Test #8, 4f
+
+ b 7f
+3:
+ cmp r2, r3
+ beq 1b
+ cmp r2, r3
+#else
+ uxtb r0, r2
+ uxtb r1, r3
+ DoSub #24, 2f
+
+ uxth r0, r2
+ uxth r1, r3
+ DoSub #16, 2f
+
+ lsls r0, r2, #8
+ lsls r1, r3, #8
+ lsrs r0, r0, #8
+ lsrs r1, r1, #8
+ DoSub #8, 2f
+
+ lsrs r0, r2, #24
+ lsrs r1, r3, #24
+ subs r0, r0, r1
+2:
+ pop {r4, r5, r6, pc}
+
+3:
+ cmp r2, r3
+ beq 1b
+ rev r0, r2
+ rev r1, r3
+ cmp r0, r1
+#endif
+
+ bls 5f
+ movs r0, #1
+4:
+ pop {r4, r5, r6, pc}
+5:
+ movs r0, #0
+ mvns r0, r0
+ pop {r4, r5, r6, pc}
+6:
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ bne 7f
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ beq 6b
+7:
+ subs r0, r2, r3
+ pop {r4, r5, r6, pc}
+ .cfi_endproc
+ .size __strcmp_armv6m, . - __strcmp_armv6m
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
new file mode 100644
index 0000000..fb9cae3
--- /dev/null
+++ b/string/arm/strcmp.S
@@ -0,0 +1,479 @@
+/*
+ * strcmp for ARMv7
+ *
+ * Copyright (c) 2012-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+ available. Use ldrd to support wider loads, provided the data
+ is sufficiently aligned. Use saturating arithmetic to optimize
+ the compares. */
+
+/* Build Options:
+ STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+ byte in the string. If comparing completely random strings
+ the pre-check will save time, since there is a very high
+ probability of a mismatch in the first character: we save
+ significant overhead if this is the common case. However,
+ if strings are likely to be identical (eg because we're
+ verifying a hit in a hash table), then this check is largely
+ redundant. */
+
+#define STRCMP_NO_PRECHECK 0
+
+ /* This version uses Thumb-2 code. */
+ .thumb
+ .syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define src1 r0
+#define src2 r1
+#define result r0 /* Overlaps src1. */
+
+/* Internal variables. */
+#define tmp1 r4
+#define tmp2 r5
+#define const_m1 r12
+
+/* Additional internal variables for 64-bit aligned data. */
+#define data1a r2
+#define data1b r3
+#define data2a r6
+#define data2b r7
+#define syndrome_a tmp1
+#define syndrome_b tmp2
+
+/* Additional internal variables for 32-bit aligned data. */
+#define data1 r2
+#define data2 r3
+#define syndrome tmp2
+
+
+ /* Macro to compute and return the result value for word-aligned
+ cases. */
+ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+ /* If data1 contains a zero byte, then syndrome will contain a 1 in
+ bit 7 of that byte. Otherwise, the highest set bit in the
+ syndrome will highlight the first different bit. It is therefore
+ sufficient to extract the eight bits starting with the syndrome
+ bit. */
+ clz tmp1, \synd
+ lsl r1, \d2, tmp1
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsl \d1, \d1, tmp1
+ .cfi_remember_state
+ lsr result, \d1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1, lsr #24
+ bx lr
+#else
+ /* To use the big-endian trick we'd have to reverse all three words.
+	   That's slower than this approach.  */
+ rev \synd, \synd
+ clz tmp1, \synd
+ bic tmp1, tmp1, #7
+ lsr r1, \d2, tmp1
+ .cfi_remember_state
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsr \d1, \d1, tmp1
+ and result, \d1, #255
+ and r1, r1, #255
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1
+
+ bx lr
+#endif
+ .endm
+
+ .text
+ .p2align 5
+.Lstrcmp_start_addr:
+#if STRCMP_NO_PRECHECK == 0
+.Lfastpath_exit:
+ sub r0, r2, r3
+ bx lr
+ nop
+#endif
+def_fn __strcmp_arm
+#if STRCMP_NO_PRECHECK == 0
+ ldrb r2, [src1]
+ ldrb r3, [src2]
+ cmp r2, #1
+ it cs
+ cmpcs r2, r3
+ bne .Lfastpath_exit
+#endif
+ .cfi_startproc
+ strd r4, r5, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ .cfi_offset 4, -16
+ .cfi_offset 5, -12
+ orr tmp1, src1, src2
+ strd r6, r7, [sp, #8]
+ .cfi_offset 6, -8
+ .cfi_offset 7, -4
+ mvn const_m1, #0
+ lsl r2, tmp1, #29
+ cbz r2, .Lloop_aligned8
+
+.Lnot_aligned:
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ bne .Lmisaligned8
+
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ and tmp1, src1, #7
+ bic src1, src1, #7
+ and tmp2, tmp1, #3
+ bic src2, src2, #7
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ ldrd data1a, data1b, [src1], #16
+ tst tmp1, #4
+ ldrd data2a, data2b, [src2], #16
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp2
+ orn data1a, data1a, tmp1
+ orn data2a, data2a, tmp1
+ beq .Lstart_realigned8
+ orn data1b, data1b, tmp1
+ mov data1a, const_m1
+ orn data2b, data2b, tmp1
+ mov data2a, const_m1
+ b .Lstart_realigned8
+
+ /* Unwind the inner loop by a factor of 2, giving 16 bytes per
+ pass. */
+ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
+ .p2align 2 /* Always word aligned. */
+.Lloop_aligned8:
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
+.Lstart_realigned8:
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ cbnz syndrome_a, .Ldiff_in_a
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ cbnz syndrome_b, .Ldiff_in_b
+
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ /* Can't use CBZ for backwards branch. */
+ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+ beq .Lloop_aligned8
+
+.Ldiff_found:
+ cbnz syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+ strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+ .cfi_restore_state
+ strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+ .cfi_restore_state
+.Lmisaligned8:
+ tst tmp1, #3
+ bne .Lmisaligned4
+ ands tmp1, src1, #3
+ bne .Lmutual_align4
+
+ /* Unrolled by a factor of 2, to reduce the number of post-increment
+ operations. */
+.Lloop_aligned4:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned4:
+ uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cbnz syndrome, .Laligned4_done
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
+ uadd8 syndrome, data1, const_m1
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cmp syndrome, #0
+ beq .Lloop_aligned4
+
+.Laligned4_done:
+ strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+ .cfi_restore_state
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ lsl tmp1, tmp1, #3 /* Bytes -> bits. */
+ bic src1, src1, #3
+ ldr data1, [src1], #8
+ bic src2, src2, #3
+ ldr data2, [src2], #8
+
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp1
+ orn data1, data1, tmp1
+ orn data2, data2, tmp1
+ b .Lstart_realigned4
+
+.Lmisaligned4:
+ ands tmp1, src1, #3
+ beq .Lsrc1_aligned
+ sub src2, src2, tmp1
+ bic src1, src1, #3
+ lsls tmp1, tmp1, #31
+ ldr data1, [src1], #4
+ beq .Laligned_m2
+ bcs .Laligned_m1
+
+#if STRCMP_NO_PRECHECK == 1
+ ldrb data2, [src2, #1]
+ uxtb tmp1, data1, ror #BYTE1_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m1:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ add src2, src2, #4
+ cbnz data2, .Lsrc1_aligned
+#else /* STRCMP_NO_PRECHECK */
+ /* If we've done the pre-check, then we don't need to check the
+ first byte again here. */
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbnz data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+ .cfi_remember_state
+ mov result, tmp1
+ ldr r4, [sp], #16
+ .cfi_restore 4
+ bx lr
+
+#if STRCMP_NO_PRECHECK == 0
+.Laligned_m1:
+ add src2, src2, #4
+#endif
+.Lsrc1_aligned:
+ .cfi_restore_state
+ /* src1 is word aligned, but src2 has no common alignment
+ with it. */
+ ldr data1, [src1], #4
+ lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
+
+ bic src2, src2, #3
+ ldr data2, [src2], #4
+ bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
+
+ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
+.Loverlap3:
+ bic tmp1, data1, #MSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #8
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #24
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap3
+4:
+ S2LO data2, data2, #8
+ b .Lstrcmp_tail
+
+5:
+ bics syndrome, syndrome, #MSB
+ bne .Lstrcmp_done_equal
+
+ /* We can only get here if the MSB of data1 contains 0, so
+ fast-path the exit. */
+ ldrb result, [src2]
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 Not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ neg result, result
+ bx lr
+
+6:
+ .cfi_restore_state
+ S2LO data1, data1, #24
+ and data2, data2, #LSB
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap2:
+ and tmp1, data1, const_m1, S2LO #16
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #16
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #16
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap2
+4:
+ S2LO data2, data2, #16
+ b .Lstrcmp_tail
+5:
+ ands syndrome, syndrome, const_m1, S2LO #16
+ bne .Lstrcmp_done_equal
+
+ ldrh data2, [src2]
+ S2LO data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+ lsl data2, data2, #16
+#endif
+ b .Lstrcmp_tail
+
+6:
+ S2LO data1, data1, #16
+ and data2, data2, const_m1, S2LO #16
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap1:
+ and tmp1, data1, #LSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #24
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #8
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap1
+4:
+ S2LO data2, data2, #24
+ b .Lstrcmp_tail
+5:
+ tst syndrome, #LSB
+ bne .Lstrcmp_done_equal
+ ldr data2, [src2]
+6:
+ S2LO data1, data1, #8
+ bic data2, data2, #MSB
+ b .Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+ mov result, #0
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ bx lr
+
+.Lstrcmp_tail:
+ .cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+ rev data1, data1
+ rev data2, data2
+ /* Now everything looks big-endian... */
+#endif
+ uadd8 tmp1, data1, const_m1
+ eor tmp1, data1, data2
+ sel syndrome, tmp1, const_m1
+ clz tmp1, syndrome
+ lsl data1, data1, tmp1
+ lsl data2, data2, tmp1
+ lsr result, data1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ sub result, result, data2, lsr #24
+ bx lr
+ .cfi_endproc
+ .size __strcmp, . - .Lstrcmp_start_addr
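Note: the Thumb-2 strcmp above compares whole words at a time and uses UADD8/SEL to build a "syndrome" that is non-zero at the first byte which is either a NUL in data1 or a difference between data1 and data2. The C sketch below is illustrative only (not part of the library; the helper names are made up). Like the assembly, it assumes word-aligned pointers, so the word-sized over-read stays within the aligned word containing the terminator.

#include <stdint.h>
#include <string.h>

/* Non-zero iff the 32-bit word w contains a zero byte: the classic
   (w - 0x01010101) & ~w & 0x80808080 test. */
uint32_t has_zero_byte (uint32_t w)
{
  return (w - 0x01010101u) & ~w & 0x80808080u;
}

/* Word-at-a-time comparison sketch; both pointers assumed word aligned. */
int strcmp_words_sketch (const char *s1, const char *s2)
{
  for (;;)
    {
      uint32_t w1, w2;
      memcpy (&w1, s1, 4);
      memcpy (&w2, s2, 4);
      /* A non-zero "syndrome" means a differing byte or a NUL in w1. */
      if ((w1 ^ w2) != 0 || has_zero_byte (w1))
        break;
      s1 += 4;
      s2 += 4;
    }
  /* Resolve the result byte by byte within the final word. */
  const unsigned char *u1 = (const unsigned char *) s1;
  const unsigned char *u2 = (const unsigned char *) s2;
  while (*u1 != 0 && *u1 == *u2)
    u1++, u2++;
  return *u1 - *u2;
}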
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
new file mode 100644
index 0000000..48ebbe8
--- /dev/null
+++ b/string/arm/strcpy.c
@@ -0,0 +1,129 @@
+/*
+ * strcpy
+ *
+ * Copyright (c) 2008-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
+
+char* __attribute__((naked))
+__strcpy_arm (char* dst, const char* src)
+{
+ __asm__ (
+ "pld [r1, #0]\n\t"
+ "eor r2, r0, r1\n\t"
+ "mov ip, r0\n\t"
+ "tst r2, #3\n\t"
+ "bne 4f\n\t"
+ "tst r1, #3\n\t"
+ "bne 3f\n"
+ "5:\n\t"
+# ifndef __thumb2__
+ "str r5, [sp, #-4]!\n\t"
+ "mov r5, #0x01\n\t"
+ "orr r5, r5, r5, lsl #8\n\t"
+ "orr r5, r5, r5, lsl #16\n\t"
+# endif
+
+ "str r4, [sp, #-4]!\n\t"
+ "tst r1, #4\n\t"
+ "ldr r3, [r1], #4\n\t"
+ "beq 2f\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "streq r3, [ip], #4\n\t"
+ "ldreq r3, [r1], #4\n"
+ "bne 1f\n\t"
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ ".p2align 2\n"
+ "2:\n\t"
+ "pld [r1, #8]\n\t"
+ "ldr r4, [r1], #4\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "sub r2, r4, "magic1(r5)"\n\t"
+ "bne 1f\n\t"
+ "str r3, [ip], #4\n\t"
+ "bics r2, r2, r4\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "ldreq r3, [r1], #4\n\t"
+ "streq r4, [ip], #4\n\t"
+ "beq 2b\n\t"
+ "mov r3, r4\n"
+ "1:\n\t"
+# ifdef __ARMEB__
+ "rors r3, r3, #24\n\t"
+# endif
+ "strb r3, [ip], #1\n\t"
+ "tst r3, #0xff\n\t"
+# ifdef __ARMEL__
+ "ror r3, r3, #8\n\t"
+# endif
+ "bne 1b\n\t"
+ "ldr r4, [sp], #4\n\t"
+# ifndef __thumb2__
+ "ldr r5, [sp], #4\n\t"
+# endif
+ "BX LR\n"
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+ "3:\n\t"
+ "tst r1, #1\n\t"
+ "beq 1f\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "it eq\n"
+ "BXEQ LR\n"
+ "1:\n\t"
+ "tst r1, #2\n\t"
+ "beq 5b\n\t"
+ "ldrh r2, [r1], #2\n\t"
+# ifdef __ARMEB__
+ "tst r2, #0xff00\n\t"
+ "iteet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "lsreq r2, r2, #8\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff\n\t"
+# else
+ "tst r2, #0xff\n\t"
+ "itet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff00\n\t"
+# endif
+ "bne 5b\n\t"
+ "BX LR\n"
+
+ /* src and dst do not have a common word-alignment. Fall back to
+ byte copying. */
+ "4:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 4b\n\t"
+ "BX LR");
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
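Note: the inline-assembly strcpy above detects the terminating NUL a word at a time using the 0x01010101 / 0x80808080 "magic" constants (the sub/bics/tst sequence). A rough portable equivalent is sketched below; the function name is made up, and unlike the assembly it only aligns the source, relying on memcpy for the possibly unaligned stores.

#include <stdint.h>
#include <string.h>

char *strcpy_words_sketch (char *dst, const char *src)
{
  char *d = dst;
  /* Copy byte by byte until src is word aligned. */
  while (((uintptr_t) src & 3) != 0)
    if ((*d++ = *src++) == '\0')
      return dst;
  for (;;)
    {
      uint32_t w;
      memcpy (&w, src, 4);
      /* Same test as the magic1/magic2 constants above: non-zero iff
         the word contains a zero byte, so stop before storing it. */
      if ((w - 0x01010101u) & ~w & 0x80808080u)
        break;
      memcpy (d, &w, 4);
      d += 4;
      src += 4;
    }
  /* Finish byte by byte, copying the terminating NUL. */
  while ((*d++ = *src++) != '\0')
    ;
  return dst;
}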
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
new file mode 100644
index 0000000..279ec87
--- /dev/null
+++ b/string/arm/strlen-armv6t2.S
@@ -0,0 +1,125 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Assumes:
+ ARMv6T2, AArch32
+
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#ifdef __ARMEB__
+#define S2LO lsl
+#define S2HI lsr
+#else
+#define S2LO lsr
+#define S2HI lsl
+#endif
+
+ /* This code requires Thumb. */
+ .thumb
+ .syntax unified
+
+/* Parameters and result. */
+#define srcin r0
+#define result r0
+
+/* Internal variables. */
+#define src r1
+#define data1a r2
+#define data1b r3
+#define const_m1 r12
+#define const_0 r4
+#define tmp1 r4 /* Overlaps const_0 */
+#define tmp2 r5
+
+def_fn __strlen_armv6t2 p2align=6
+ pld [srcin, #0]
+ strd r4, r5, [sp, #-8]!
+ bic src, srcin, #7
+ mvn const_m1, #0
+ ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
+ pld [src, #32]
+ bne.w .Lmisaligned8
+ mov const_0, #0
+ mov result, #-8
+.Lloop_aligned:
+ /* Bytes 0-7. */
+ ldrd data1a, data1b, [src]
+ pld [src, #64]
+ add result, result, #8
+.Lstart_realigned:
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 8-15. */
+ ldrd data1a, data1b, [src, #8]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 16-23. */
+ ldrd data1a, data1b, [src, #16]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 24-31. */
+ ldrd data1a, data1b, [src, #24]
+ add src, src, #32
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cmp data1b, #0
+ beq .Lloop_aligned
+
+.Lnull_found:
+ cmp data1a, #0
+ itt eq
+ addeq result, result, #4
+ moveq data1a, data1b
+#ifndef __ARMEB__
+ rev data1a, data1a
+#endif
+ clz data1a, data1a
+ ldrd r4, r5, [sp], #8
+ add result, result, data1a, lsr #3 /* Bits -> Bytes. */
+ bx lr
+
+.Lmisaligned8:
+ ldrd data1a, data1b, [src]
+ and tmp2, tmp1, #3
+ rsb result, tmp1, #0
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ tst tmp1, #4
+ pld [src, #64]
+ S2HI tmp2, const_m1, tmp2
+ orn data1a, data1a, tmp2
+ itt ne
+ ornne data1b, data1b, tmp2
+ movne data1a, const_m1
+ mov const_0, #0
+ b .Lstart_realigned
+ .size __strlen_armv6t2, . - __strlen_armv6t2
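Note: __strlen_armv6t2 above scans the string in aligned words, using UADD8/SEL against const_m1 to detect a zero byte and then REV/CLZ to locate it within the word. The sketch below is illustrative, not the library code: it uses the equivalent magic-constant test and a plain byte scan for the final word.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

size_t strlen_words_sketch (const char *s)
{
  const char *p = s;
  /* Byte scan until p is word aligned. */
  while (((uintptr_t) p & 3) != 0)
    {
      if (*p == '\0')
        return (size_t) (p - s);
      p++;
    }
  /* Word scan: non-zero iff the word contains a zero byte.  The assembly
     derives the same answer from the GE flags and then uses REV + CLZ to
     find the exact byte in constant time. */
  for (;;)
    {
      uint32_t w;
      memcpy (&w, p, 4);
      if ((w - 0x01010101u) & ~w & 0x80808080u)
        break;
      p += 4;
    }
  while (*p != '\0')
    p++;
  return (size_t) (p - s);
}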
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 753d06a..96647cf 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -14,4 +14,36 @@
#if __aarch64__
void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t);
+void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64 (void *, int, size_t);
+void *__memchr_aarch64 (const void *, int, size_t);
+int __memcmp_aarch64 (const void *, const void *, size_t);
+char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+int __strcmp_aarch64 (const char *, const char *);
+char *__strchr_aarch64 (const char *, int);
+char *__strchrnul_aarch64 (const char *, int);
+size_t __strlen_aarch64 (const char *);
+size_t __strnlen_aarch64 (const char *, size_t);
+int __strncmp_aarch64 (const char *, const char *, size_t);
+# if __ARM_FEATURE_SVE
+void *__memchr_aarch64_sve (const void *, int, size_t);
+int __memcmp_aarch64_sve (const void *, const void *, size_t);
+char *__strchr_aarch64_sve (const char *, int);
+char *__strrchr_aarch64_sve (const char *, int);
+char *__strchrnul_aarch64_sve (const char *, int);
+int __strcmp_aarch64_sve (const char *, const char *);
+char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+size_t __strlen_aarch64_sve (const char *);
+size_t __strnlen_aarch64_sve (const char *, size_t);
+int __strncmp_aarch64_sve (const char *, const char *, size_t);
+# endif
+#elif __arm__
+void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
+void *__memset_arm (void *, int, size_t);
+void *__memchr_arm (const void *, int, size_t);
+char *__strcpy_arm (char *__restrict, const char *__restrict);
+int __strcmp_arm (const char *, const char *);
+int __strcmp_armv6m (const char *, const char *);
+size_t __strlen_armv6t2 (const char *);
#endif
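Note: the entry points declared above can be called directly; whether a given symbol is actually built depends on the same preprocessor conditions and on the string/Dir.mk configuration. A minimal caller, shown only as an illustration, might look like this:

#include <stdio.h>
#include <string.h>
#include "stringlib.h"

int main (void)
{
  char buf[32];
#if __aarch64__
  __strcpy_aarch64 (buf, "hello");
#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
  __strcpy_arm (buf, "hello");
#else
  strcpy (buf, "hello");      /* Fall back to the C library. */
#endif
  printf ("%s\n", buf);
  return 0;
}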
diff --git a/string/memchr.S b/string/memchr.S
new file mode 100644
index 0000000..0a564d8
--- /dev/null
+++ b/string/memchr.S
@@ -0,0 +1,15 @@
+/*
+ * Selected possible memchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memchr-sve.S"
+# endif
+#elif __arm__
+#include "arm/memchr.S"
+#endif
diff --git a/string/memcmp.S b/string/memcmp.S
new file mode 100644
index 0000000..22da685
--- /dev/null
+++ b/string/memcmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible memcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memcmp-sve.S"
+# endif
+#endif
diff --git a/string/memcpy.S b/string/memcpy.S
new file mode 100644
index 0000000..c0f23e3
--- /dev/null
+++ b/string/memcpy.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcpy.S"
+#elif __arm__
+#include "arm/memcpy.S"
+#endif
diff --git a/string/memmove.S b/string/memmove.S
new file mode 100644
index 0000000..be3c7a1
--- /dev/null
+++ b/string/memmove.S
@@ -0,0 +1,10 @@
+/*
+ * Selected possible memmove implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memmove.S"
+#endif
diff --git a/string/memset.S b/string/memset.S
new file mode 100644
index 0000000..57542ef
--- /dev/null
+++ b/string/memset.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memset implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memset.S"
+#elif __arm__
+#include "arm/memset.S"
+#endif
diff --git a/string/strchr.S b/string/strchr.S
new file mode 100644
index 0000000..8cead02
--- /dev/null
+++ b/string/strchr.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchr-sve.S"
+# endif
+#endif
diff --git a/string/strchrnul.S b/string/strchrnul.S
new file mode 100644
index 0000000..3dfdeef
--- /dev/null
+++ b/string/strchrnul.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchrnul implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchrnul.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchrnul-sve.S"
+# endif
+#endif
diff --git a/string/strcmp.S b/string/strcmp.S
new file mode 100644
index 0000000..12530ec
--- /dev/null
+++ b/string/strcmp.S
@@ -0,0 +1,19 @@
+/*
+ * Selected possible strcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcmp-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+#include "arm/strcmp.S"
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+#include "arm/strcmp-armv6m.S"
+# endif
+#endif
diff --git a/string/strcpy-c.c b/string/strcpy-c.c
new file mode 100644
index 0000000..6bde24a
--- /dev/null
+++ b/string/strcpy-c.c
@@ -0,0 +1,10 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __arm__ && defined (__thumb2__) && !defined (__thumb__)
+#include "arm/strcpy.c"
+#endif
diff --git a/string/strcpy.S b/string/strcpy.S
new file mode 100644
index 0000000..a604b22
--- /dev/null
+++ b/string/strcpy.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcpy.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcpy-sve.S"
+# endif
+#endif
diff --git a/string/strlen.S b/string/strlen.S
new file mode 100644
index 0000000..d681033
--- /dev/null
+++ b/string/strlen.S
@@ -0,0 +1,17 @@
+/*
+ * Selected possible strlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strlen-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+#include "arm/strlen-armv6t2.S"
+# endif
+#endif
diff --git a/string/strncmp.S b/string/strncmp.S
new file mode 100644
index 0000000..26b56b7
--- /dev/null
+++ b/string/strncmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strncmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strncmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strncmp-sve.S"
+# endif
+#endif
diff --git a/string/strnlen.S b/string/strnlen.S
new file mode 100644
index 0000000..eebe777
--- /dev/null
+++ b/string/strnlen.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strnlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strnlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strnlen-sve.S"
+# endif
+#endif
diff --git a/string/strrchr.S b/string/strrchr.S
new file mode 100644
index 0000000..18b1cf9
--- /dev/null
+++ b/string/strrchr.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible strrchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+# if __ARM_FEATURE_SVE
+#include "aarch64/strrchr-sve.S"
+# endif
+#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
new file mode 100644
index 0000000..8d609c9
--- /dev/null
+++ b/string/test/memchr.c
@@ -0,0 +1,94 @@
+/*
+ * memchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(const void *, int c, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memchr)
+#if __aarch64__
+F(__memchr_aarch64)
+# if __ARM_FEATURE_SVE
+F(__memchr_aarch64_sve)
+# endif
+#elif __arm__
+F(__memchr_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static unsigned char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + align;
+ unsigned char *f = len ? s + seekpos : 0;
+ int seekchar = 0x1;
+ int i;
+ void *p;
+
+ if (len > LEN || seekpos >= len || align >= A)
+ abort();
+
+ for (i = 0; i < seekpos; i++)
+ s[i] = 'a' + i%23;
+ s[i++] = seekchar;
+ for (; i < len; i++)
+ s[i] = 'a' + i%23;
+
+ p = fun->fun(s, seekchar, len);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ for (int n = 0; n < 100; n++)
+ for (int sp = 0; sp < n-1; sp++)
+ test(funtab+i, a, sp, n);
+ for (int n = 100; n < LEN; n *= 2) {
+ test(funtab+i, a, n-1, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
new file mode 100644
index 0000000..63b07bd
--- /dev/null
+++ b/string/test/memcmp.c
@@ -0,0 +1,97 @@
+/*
+ * memcmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const void *s1, const void *s2, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memcmp)
+#if __aarch64__
+F(__memcmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__memcmp_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char s1buf[LEN+2*A];
+static unsigned char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+{
+ unsigned char *src1 = alignup(s1buf);
+ unsigned char *src2 = alignup(s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos && diffpos >= len)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos)
+ s1[diffpos]++;
+
+ r = fun->fun(s1, s2, len);
+
+ if ((!diffpos && r != 0) || (diffpos && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 1dccac7..26ab0ec 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -20,6 +20,9 @@ static const struct fun
F(memcpy)
#if __aarch64__
F(__memcpy_bytewise)
+F(__memcpy_aarch64)
+#elif __arm__
+F(__memcpy_arm)
#endif
#undef F
{0, 0}
diff --git a/string/test/memmove.c b/string/test/memmove.c
new file mode 100644
index 0000000..8164383
--- /dev/null
+++ b/string/test/memmove.c
@@ -0,0 +1,142 @@
+/*
+ * memmove test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memmove)
+#if __aarch64__
+F(__memmove_aarch64)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char dbuf[LEN+2*A];
+static unsigned char sbuf[LEN+2*A];
+static unsigned char wbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
+ }
+}
+
+static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(sbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ /* Copy the potential overlap range. */
+ if (s < d) {
+ for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
+ want[salign+i] = src[salign+i];
+ } else {
+ for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
+ want[len + dalign + i] = src[len + dalign + i];
+ }
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ abort();
+ break;
+ }
+ }
+}
+
+int main()
+{
+ test_overlap(funtab+0, 2, 1, 1);
+
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
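Note: test_overlap above builds its expected buffer by hand from the overlap range. The semantics it checks are those of a direction-aware copy; a naive reference sketch (not the library implementation) makes this explicit:

#include <stddef.h>
#include <stdint.h>

void *memmove_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if ((uintptr_t) d < (uintptr_t) s)
    {
      /* Forward copy: dst is below src, so reads stay ahead of writes. */
      for (size_t i = 0; i < n; i++)
        d[i] = s[i];
    }
  else
    {
      /* Backward copy: dst is above src, so copy from the end. */
      for (size_t i = n; i > 0; i--)
        d[i - 1] = s[i - 1];
    }
  return dst;
}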
diff --git a/string/test/memset.c b/string/test/memset.c
new file mode 100644
index 0000000..c0c7ed6
--- /dev/null
+++ b/string/test/memset.c
@@ -0,0 +1,112 @@
+/*
+ * memset test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *s, int c, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memset)
+#if __aarch64__
+F(__memset_aarch64)
+#elif __arm__
+F(__memset_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void err(const char *name, unsigned char *src, int salign, int c, int len)
+{
+ ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
+ ERR("got : %.*s\n", salign+len+1, src);
+}
+
+static void test(const struct fun *fun, int salign, int c, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (len > LEN || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i%23;
+ for (; i<len%A; i++)
+ s[i] = '*';
+
+ p = fun->fun(s, c, len);
+ if (p != s)
+ ERR("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++) {
+ if (src[i] != '?') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+ for (i = salign; i < len; i++) {
+ if (src[i] != (unsigned char)c) {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+ for (; i < len%A; i++) {
+ if (src[i] != '*') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
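Note: the test deliberately passes values such as 0xaa25 for c, since memset stores the value converted to unsigned char and only the low byte is written. A standalone illustration, separate from the test harness:

#include <assert.h>
#include <string.h>

int main (void)
{
  unsigned char buf[8];
  memset (buf, 0xaa25, sizeof buf);
  assert (buf[0] == 0x25);   /* (unsigned char)0xaa25 == 0x25 */
  return 0;
}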
diff --git a/string/test/strchr.c b/string/test/strchr.c
new file mode 100644
index 0000000..30c714f
--- /dev/null
+++ b/string/test/strchr.c
@@ -0,0 +1,98 @@
+/*
+ * strchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strchr)
+#if __aarch64__
+F(__strchr_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strchr_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
new file mode 100644
index 0000000..c4260e6
--- /dev/null
+++ b/string/test/strchrnul.c
@@ -0,0 +1,100 @@
+/*
+ * strchrnul test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strchrnul)
+#if __aarch64__
+F(__strchrnul_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strchrnul_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len - 1;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
new file mode 100644
index 0000000..c4e8867
--- /dev/null
+++ b/string/test/strcmp.c
@@ -0,0 +1,104 @@
+/*
+ * strcmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const char *s1, const char *s2);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strcmp)
+#if __aarch64__
+F(__strcmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strcmp_aarch64_sve)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+F(__strcmp_arm)
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+F(__strcmp_armv6m)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+{
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
+
+ r = fun->fun(s1, s2);
+
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
new file mode 100644
index 0000000..3072ade
--- /dev/null
+++ b/string/test/strcpy.c
@@ -0,0 +1,100 @@
+/*
+ * strcpy test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(char *dest, const char *src);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strcpy)
+#if __aarch64__
+F(__strcpy_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strcpy_aarch64_sve)
+# endif
+#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
+F(__strcpy_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char dbuf[LEN+2*A];
+static char sbuf[LEN+2*A];
+static char wbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup(sbuf);
+ char *dst = alignup(dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len-1; i++)
+ s[i] = w[i] = 'a' + i%23;
+ s[i] = w[i] = '\0';
+
+ p = fun->fun(d, s);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++)
+ test(funtab+i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, d, s, n);
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strlen.c b/string/test/strlen.c
new file mode 100644
index 0000000..700c865
--- /dev/null
+++ b/string/test/strlen.c
@@ -0,0 +1,91 @@
+/*
+ * strlen test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun)(const char *s);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strlen)
+#if __aarch64__
+F(__strlen_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strlen_aarch64_sve)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+F(__strlen_armv6t2)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
+
+ if (len > LEN || align >= A)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
+
+ r = fun->fun(s);
+ if (r != len-1) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ test(funtab+i, a, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, a, n);
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
new file mode 100644
index 0000000..14e0a8c
--- /dev/null
+++ b/string/test/strncmp.c
@@ -0,0 +1,104 @@
+/*
+ * strncmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const char *, const char *, size_t);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strncmp)
+#if __aarch64__
+F(__strncmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strncmp_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
+{
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
+
+ r = fun->fun(s1, s2, maxlen);
+
+ diffpos = maxlen <= diffpos ? 0 : diffpos;
+
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
+ fun->name, s1align, s2align, maxlen, len, r, diffpos);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
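Note: the clamp of diffpos against maxlen above reflects strncmp semantics: at most maxlen characters are compared, so a difference at or beyond that limit must not change the result. A standalone illustration, separate from the test harness:

#include <assert.h>
#include <string.h>

int main (void)
{
  const char *a = "abcdef";
  const char *b = "abcxyz";          /* first difference at index 3 */
  assert (strncmp (a, b, 3) == 0);   /* difference lies beyond maxlen */
  assert (strncmp (a, b, 4) != 0);   /* difference now within range */
  return 0;
}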
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
new file mode 100644
index 0000000..9a98d80
--- /dev/null
+++ b/string/test/strnlen.c
@@ -0,0 +1,94 @@
+/*
+ * strnlen test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun)(const char *s, size_t m);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strnlen)
+#if __aarch64__
+F(__strnlen_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strnlen_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int maxlen, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len - 1;
+
+ if (len > LEN || align >= A)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
+
+ r = fun->fun(s, maxlen);
+ if (r != e) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ for (int maxlen = 0; maxlen < 100; maxlen++)
+ test(funtab+i, a, maxlen, n);
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, n*2, n);
+ test(funtab+i, a, n, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
new file mode 100644
index 0000000..b3fc2a9
--- /dev/null
+++ b/string/test/strrchr.c
@@ -0,0 +1,97 @@
+/*
+ * strrchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strrchr)
+#if __aarch64__
+# if __ARM_FEATURE_SVE
+F(__strrchr_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos/2] = s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}