author     Treehugger Robot <treehugger-gerrit@google.com>           2019-09-18 11:57:37 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>  2019-09-18 11:57:37 +0000
commit     6a751e14fcef594edf6a0eb7fab98799deaa484e (patch)
tree       d58d9102b2010c48c3d95474fb5914ae4c2371d9
parent     055b2f3c3849d992f9287a2fa0dbe79bd9f2f251 (diff)
parent     68a0658ee72ca4c3961719900e9f97562dd66931 (diff)
Merge "Upgrade arm-optimized-routines to 9c8399909a9835e6f55977df1661cf6306c56707"ndk-sysroot-r21
 METADATA                       |   6
 math/exp.c                     |   4
 math/exp2.c                    |   4
 math/include/mathlib.h         |   3
 math/log.c                     |   4
 math/log2.c                    |   4
 math/pow.c                     |   4
 math/test/ulp.c                |   2
 string/Dir.mk                  |  24
 string/aarch64/memchr-sve.S    |  62
 string/aarch64/memchr.S        | 149
 string/aarch64/memcmp-sve.S    |  48
 string/aarch64/memcmp.S        | 141
 string/aarch64/memcpy.S        | 178
 string/aarch64/memmove.S       | 103
 string/aarch64/memset.S        | 188
 string/aarch64/strchr-sve.S    |  69
 string/aarch64/strchr.S        | 137
 string/aarch64/strchrnul-sve.S |   9
 string/aarch64/strchrnul.S     | 122
 string/aarch64/strcmp-sve.S    |  57
 string/aarch64/strcmp.S        | 177
 string/aarch64/strcpy-sve.S    |  69
 string/aarch64/strcpy.S        | 314
 string/aarch64/strlen-sve.S    |  55
 string/aarch64/strlen.S        | 214
 string/aarch64/strncmp-sve.S   |  66
 string/aarch64/strncmp.S       | 266
 string/aarch64/strnlen-sve.S   |  72
 string/aarch64/strnlen.S       | 160
 string/aarch64/strrchr-sve.S   |  83
 string/arm/memchr.S            | 133
 string/arm/memcpy.S            | 593
 string/arm/memset.S            |  99
 string/arm/strcmp-armv6m.S     | 118
 string/arm/strcmp.S            | 479
 string/arm/strcpy.c            | 129
 string/arm/strlen-armv6t2.S    | 125
 string/include/stringlib.h     |  32
 string/memchr.S                |  15
 string/memcmp.S                |  13
 string/memcpy.S                |  12
 string/memmove.S               |  10
 string/memset.S                |  12
 string/strchr.S                |  13
 string/strchrnul.S             |  13
 string/strcmp.S                |  19
 string/strcpy-c.c              |  10
 string/strcpy.S                |  13
 string/strlen.S                |  17
 string/strncmp.S               |  13
 string/strnlen.S               |  13
 string/strrchr.S               |  12
 string/test/memchr.c           |  94
 string/test/memcmp.c           |  97
 string/test/memcpy.c           |   3
 string/test/memmove.c          | 142
 string/test/memset.c           | 112
 string/test/strchr.c           |  98
 string/test/strchrnul.c        | 100
 string/test/strcmp.c           | 104
 string/test/strcpy.c           | 100
 string/test/strlen.c           |  91
 string/test/strncmp.c          | 104
 string/test/strnlen.c          |  94
 string/test/strrchr.c          |  97
 66 files changed, 5916 insertions(+), 7 deletions(-)
diff --git a/METADATA b/METADATA
index 7c706d5..9762f4f 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "6b594432c8ac46e71686ea21fad30d1c3f79e65a"
+ version: "9c8399909a9835e6f55977df1661cf6306c56707"
license_type: NOTICE
last_upgrade_date {
year: 2019
- month: 7
- day: 31
+ month: 9
+ day: 3
}
}
diff --git a/math/exp.c b/math/exp.c
index ffd3111..1909b8e 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -169,4 +170,7 @@ __exp_dd (double x, double xtail)
strong_alias (exp, __exp_finite)
hidden_alias (exp, __ieee754_exp)
hidden_alias (__exp_dd, __exp1)
+# if LDBL_MANT_DIG == 53
+long double expl (long double x) { return exp (x); }
+# endif
#endif
diff --git a/math/exp2.c b/math/exp2.c
index fbedbcb..47aa479 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -136,4 +137,7 @@ exp2 (double x)
#if USE_GLIBC_ABI
strong_alias (exp2, __exp2_finite)
hidden_alias (exp2, __ieee754_exp2)
+# if LDBL_MANT_DIG == 53
+long double exp2l (long double x) { return exp2 (x); }
+# endif
#endif
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index aac2d4d..eed294b 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -5,9 +5,6 @@
* SPDX-License-Identifier: MIT
*/
-float sinf (float);
-float cosf (float);
-float tanf (float);
float expf (float);
float exp2f (float);
float logf (float);
diff --git a/math/log.c b/math/log.c
index 1283ef2..b85d3ff 100644
--- a/math/log.c
+++ b/math/log.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -155,4 +156,7 @@ log (double x)
#if USE_GLIBC_ABI
strong_alias (log, __log_finite)
hidden_alias (log, __ieee754_log)
+# if LDBL_MANT_DIG == 53
+long double logl (long double x) { return log (x); }
+# endif
#endif
diff --git a/math/log2.c b/math/log2.c
index 478b33d..804fb85 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -134,4 +135,7 @@ log2 (double x)
#if USE_GLIBC_ABI
strong_alias (log2, __log2_finite)
hidden_alias (log2, __ieee754_log2)
+# if LDBL_MANT_DIG == 53
+long double log2l (long double x) { return log2 (x); }
+# endif
#endif
diff --git a/math/pow.c b/math/pow.c
index e55f159..493488d 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <float.h>
#include <math.h>
#include <stdint.h>
#include "math_config.h"
@@ -371,4 +372,7 @@ pow (double x, double y)
#if USE_GLIBC_ABI
strong_alias (pow, __pow_finite)
hidden_alias (pow, __ieee754_pow)
+# if LDBL_MANT_DIG == 53
+long double powl (long double x, long double y) { return pow (x, y); }
+# endif
#endif
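Note on the pattern above: the same three-line guard is added to exp.c, exp2.c, log.c, log2.c and pow.c. When long double has the IEEE binary64 format (LDBL_MANT_DIG == 53), the long double entry points can simply forward to the double implementations without losing precision. A minimal consolidated C sketch of the idea (illustration only, not part of the commit; the real code additionally sits behind USE_GLIBC_ABI as shown in the hunks):

    #include <float.h>
    #include <math.h>

    #if LDBL_MANT_DIG == 53
    /* long double and double share the 53-bit binary64 format here,
       so these wrappers are exact. */
    long double expl (long double x) { return exp (x); }
    long double logl (long double x) { return log (x); }
    long double powl (long double x, long double y) { return pow (x, y); }
    #endif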
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 8de6e5b..8782fb0 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -704,7 +704,7 @@ main (int argc, char *argv[])
if (!USE_MPFR && conf.mpfr)
{
puts ("mpfr is not available.");
- return 1;
+ return 0;
}
argc--;
argv++;
diff --git a/string/Dir.mk b/string/Dir.mk
index e179642..bd9979f 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -14,6 +14,18 @@ string-libs := \
string-tools := \
build/bin/test/memcpy \
+ build/bin/test/memmove \
+ build/bin/test/memset \
+ build/bin/test/memchr \
+ build/bin/test/memcmp \
+ build/bin/test/strcpy \
+ build/bin/test/strcmp \
+ build/bin/test/strchr \
+ build/bin/test/strrchr \
+ build/bin/test/strchrnul \
+ build/bin/test/strlen \
+ build/bin/test/strnlen \
+ build/bin/test/strncmp
string-lib-base := $(basename $(string-lib-srcs))
string-lib-objs := $(string-lib-base:$(srcdir)/%=build/%.o)
@@ -47,5 +59,17 @@ build/bin/%.sh: $(srcdir)/string/test/%.sh
check-string: $(string-tools)
$(EMULATOR) build/bin/test/memcpy
+ $(EMULATOR) build/bin/test/memmove
+ $(EMULATOR) build/bin/test/memset
+ $(EMULATOR) build/bin/test/memchr
+ $(EMULATOR) build/bin/test/memcmp
+ $(EMULATOR) build/bin/test/strcpy
+ $(EMULATOR) build/bin/test/strcmp
+ $(EMULATOR) build/bin/test/strchr
+ $(EMULATOR) build/bin/test/strrchr
+ $(EMULATOR) build/bin/test/strchrnul
+ $(EMULATOR) build/bin/test/strlen
+ $(EMULATOR) build/bin/test/strnlen
+ $(EMULATOR) build/bin/test/strncmp
.PHONY: all-string check-string
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
new file mode 100644
index 0000000..0d75acd
--- /dev/null
+++ b/string/aarch64/memchr-sve.S
@@ -0,0 +1,62 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __memchr_aarch64_sve
+ .type __memchr_aarch64_sve, %function
+ .p2align 4
+__memchr_aarch64_sve:
+ dup z1.b, w1 /* duplicate c to a vector */
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+ nop
+
+0: whilelo p1.b, x3, x2 /* make sure off < max */
+ b.none 9f
+
+ /* Read a vector's worth of bytes, bounded by max,
+ stopping on first fault. */
+ ldff1b z0.b, p1/z, [x0, x3]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector bounded by max is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x3 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ b.none 0b
+ decb x3 /* undo speculate */
+
+ /* Found C. */
+1: brkb p2.b, p1/z, p2.b /* find the first c */
+ add x0, x0, x3 /* form partial pointer */
+ incp x0, p2.b /* form final pointer to c */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b
+ b.any 1b
+
+ /* No C found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x3, p0.b
+ b 0b
+
+ /* Found end of count. */
+9: mov x0, 0 /* return null */
+ ret
+
+ .size __memchr_aarch64_sve, . - __memchr_aarch64_sve
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
new file mode 100644
index 0000000..e5a3abf
--- /dev/null
+++ b/string/aarch64/memchr.S
@@ -0,0 +1,149 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows to
+ * identify exactly which byte has matched.
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memchr_aarch64
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, .Lzero_length
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq .Lloop
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls .Lmasklast
+ /* Have we found something already? */
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls .Lend
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, .Lloop
+
+.Lend:
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi .Ltail
+
+.Lmasklast:
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret
+
+ .size __memchr_aarch64, . - __memchr_aarch64
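The comment block at the top of memchr.S above describes the syndrome scheme: for each 32-byte chunk a 64-bit syndrome is built with two bits per byte, bit 0 set on a match, so counting the trailing zeros of the syndrome and halving gives the index of the first matching byte. A minimal C model of that scheme (illustration only; the function name and the use of the GCC/Clang builtin __builtin_ctzll are assumptions, not part of the commit):

    #include <stddef.h>
    #include <stdint.h>

    static const void *memchr_syndrome_model (const void *s, int c, size_t n)
    {
        const unsigned char *p = s;
        for (size_t base = 0; base < n; base += 32)
        {
            size_t chunk = (n - base < 32) ? n - base : 32;
            uint64_t synd = 0;
            for (size_t i = 0; i < chunk; i++)
                if (p[base + i] == (unsigned char) c)
                    synd |= 1ull << (2 * i);   /* bit 0 of the byte's 2-bit tuple */
            if (synd != 0)
                /* Trailing-zero count is twice the byte index of the first match. */
                return p + base + (__builtin_ctzll (synd) >> 1);
        }
        return NULL;
    }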
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
new file mode 100644
index 0000000..d4f6026
--- /dev/null
+++ b/string/aarch64/memcmp-sve.S
@@ -0,0 +1,48 @@
+/*
+ * memcmp - compare memory
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __memcmp_aarch64_sve
+ .type __memcmp_aarch64_sve, %function
+ .p2align 4
+__memcmp_aarch64_sve:
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */
+ ld1b z1.b, p0/z, [x1, x3]
+
+ /* Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+
+ cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */
+ b.none 0b
+
+ /* Found inequality. */
+1: brkb p1.b, p0/z, p1.b /* find first such */
+ lasta w0, p1, z0.b /* extract each byte */
+ lasta w1, p1, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equality */
+ ret
+
+ .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
new file mode 100644
index 0000000..72a66bc
--- /dev/null
+++ b/string/aarch64/memcmp.S
@@ -0,0 +1,141 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(l) .L ## l
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memcmp_aarch64 p2align=6
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
+ .size __memcmp_aarch64, . - __memcmp_aarch64
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
new file mode 100644
index 0000000..4bbd288
--- /dev/null
+++ b/string/aarch64/memcpy.S
@@ -0,0 +1,178 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+ Small and medium copies read all data before writing, allowing any
+ kind of overlap, and memmove tailcalls memcpy for these cases as
+ well as non-overlapping copies.
+*/
+
+def_fn __memcpy_aarch64 p2align=6
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 2f
+1:
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .size __memcpy_aarch64, . - __memcpy_aarch64
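The "Copy 0..3 bytes" sequence in memcpy.S above avoids length branches by loading the first byte, the last byte and the byte at count/2, then storing all three; for count==1 the three loads hit the same byte and for count==2 the middle load duplicates one of the others. A C model of that trick (illustration only; the helper name is an assumption):

    #include <stddef.h>

    static void copy_0_3_model (unsigned char *dst, const unsigned char *src,
                                size_t count)
    {
        if (count == 0)
            return;
        size_t mid = count >> 1;      /* 0 when count==1, overlaps a neighbour otherwise */
        unsigned char first  = src[0];
        unsigned char middle = src[mid];
        unsigned char last   = src[count - 1];
        dst[0]         = first;
        dst[mid]       = middle;
        dst[count - 1] = last;
    }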
diff --git a/string/aarch64/memmove.S b/string/aarch64/memmove.S
new file mode 100644
index 0000000..5e70f21
--- /dev/null
+++ b/string/aarch64/memmove.S
@@ -0,0 +1,103 @@
+/*
+ * memmove - copy memory area
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define dstin x0
+#define src x1
+#define count x2
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+ Larger backwards copies are also handled by memcpy. The only remaining
+ case is forward large copies. The destination is aligned, and an
+ unrolled loop processes 64 bytes per iteration.
+*/
+
+def_fn __memmove_aarch64, 6
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs __memcpy_aarch64
+
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+ .size __memmove_aarch64, . - __memmove_aarch64
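The entry sequence of memmove.S above (sub/cmp/ccmp/b.hs) implements the dispatch described in its header comment: copies of at most 96 bytes, and copies where dst - src taken as an unsigned value is at least count, are forwarded to __memcpy_aarch64; only large forward-overlapping copies fall through to the backward loop. A C model of that predicate (illustration only; it relies on the custom memcpy's "read everything before writing" property for small sizes, which the standard memcpy does not promise, and the fallback here is a plain byte loop rather than the real 64-byte unrolled loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void *memmove_dispatch_model (void *dst, const void *src, size_t count)
    {
        uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;  /* wraps huge if dst < src */
        if (count <= 96 || diff >= count)
            return memcpy (dst, src, count);   /* overlap-safe for a forward copy */

        /* Large forward-overlapping copy: go backwards. */
        unsigned char *d = dst;
        const unsigned char *s = src;
        for (size_t i = count; i-- > 0;)
            d[i] = s[i];
        return dst;
    }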
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
new file mode 100644
index 0000000..aef22e9
--- /dev/null
+++ b/string/aarch64/memset.S
@@ -0,0 +1,188 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memset_aarch64 p2align=6
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
+ .size __memset_aarch64, . - __memset_aarch64
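L(try_zva) in memset.S above reads DCZID_EL0 to decide whether DC ZVA may be used and how large its zero block is: bit 4 set means the instruction is prohibited, and the low four bits hold log2 of the block size in 4-byte words, which is why the code computes 4 << (dczid & 15). A small AArch64-only C sketch of that decode (illustration only; the function names are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    static inline uint64_t read_dczid_el0 (void)
    {
        uint64_t v;
        __asm__ volatile ("mrs %0, dczid_el0" : "=r" (v));  /* readable at EL0 */
        return v;
    }

    static inline bool dc_zva_permitted (uint64_t dczid)
    {
        return (dczid & (1u << 4)) == 0;   /* DZP bit set means DC ZVA prohibited */
    }

    static inline unsigned dc_zva_block_bytes (uint64_t dczid)
    {
        return 4u << (dczid & 15);         /* BS field: log2 of block size in words */
    }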
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
new file mode 100644
index 0000000..8d8a319
--- /dev/null
+++ b/string/aarch64/strchr-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strchr/strchrnul - find a character in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
+#ifdef BUILD_STRCHRNUL
+#define FUNC __strchrnul_aarch64_sve
+#else
+#define FUNC __strchr_aarch64_sve
+#endif
+
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */
+ b.none 0b
+ decb x0 /* undo speculate */
+
+ /* Found C or 0. */
+1: brka p4.b, p1/z, p4.b /* find first such */
+ sub x0, x0, 1 /* adjust pointer for that byte */
+ incp x0, p4.b
+#ifndef BUILD_STRCHRNUL
+ ptest p4, p2.b /* was first in c? */
+ csel x0, xzr, x0, none /* if there was no c, return null */
+#endif
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */
+ b.any 1b
+
+ /* No C or 0 found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x0, p0.b
+ b 0b
+
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
new file mode 100644
index 0000000..945be3d
--- /dev/null
+++ b/string/aarch64/strchr.S
@@ -0,0 +1,137 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __strchr_aarch64
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vend1.16b, vend2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+ /* Tmp1 is even if the target character was found first. Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+ .size __strchr_aarch64, . - __strchr_aarch64
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
new file mode 100644
index 0000000..5140e59
--- /dev/null
+++ b/string/aarch64/strchrnul-sve.S
@@ -0,0 +1,9 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STRCHRNUL
+#include "strchr-sve.S"
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
new file mode 100644
index 0000000..d19c0e8
--- /dev/null
+++ b/string/aarch64/strchrnul.S
@@ -0,0 +1,122 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __strchrnul_aarch64
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+ orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+ .size __strchrnul_aarch64, . - __strchrnul_aarch64
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
new file mode 100644
index 0000000..91bac19
--- /dev/null
+++ b/string/aarch64/strcmp-sve.S
@@ -0,0 +1,57 @@
+/*
+ * __strcmp_aarch64_sve - compare two strings
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strcmp_aarch64_sve
+ .type __strcmp_aarch64_sve, %function
+ .p2align 4
+__strcmp_aarch64_sve:
+ setffr /* initialize FFR */
+ ptrue p1.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+ nop
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, x2]
+ ldff1b z1.b, p1/z, [x1, x2]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x2, all /* skip bytes for next round */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */
+ cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p1/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: incp x2, p0.b /* skip bytes for next round */
+ setffr /* re-init FFR for next round */
+ cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p0/z, z0.b, 0
+ nands p2.b, p0/z, p2.b, p3.b
+ b.none 0b
+ b 1b
+
+ .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
new file mode 100644
index 0000000..2aa367c
--- /dev/null
+++ b/string/aarch64/strcmp.S
@@ -0,0 +1,177 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+def_fn __strcmp_aarch64 p2align=6
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+ .size __strcmp_aarch64, .-__strcmp_aarch64
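The zero-detection idiom used in strcmp.S above (and again in strcpy.S and strlen.S below) is the comment's (X - 1) & ~X & 0x80 trick applied to all eight bytes at once: subtracting REP8_01 only borrows out of a byte that is zero (or that itself received a borrow), and masking with ~X & REP8_80 discards bytes whose own top bit was set, so the result is non-zero iff the word contains a zero byte, and its lowest set bit marks the first one. A little-endian C model (illustration only; the names and the __builtin_ctzll call are assumptions):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_80 0x8080808080808080ull

    static inline uint64_t has_nul_byte (uint64_t x)
    {
        /* Non-zero iff some byte of x is zero.  Bits above the first zero
           byte may be polluted by borrow propagation (the big-endian path
           in strcmp.S recomputes the syndrome on byte-reversed data for
           exactly this reason), but the lowest set bit is always exact. */
        return (x - REP8_01) & ~x & REP8_80;
    }

    static inline int first_nul_index (uint64_t x)
    {
        /* Byte index of the first zero byte for a little-endian load,
           assuming has_nul_byte (x) != 0. */
        return __builtin_ctzll (has_nul_byte (x)) >> 3;
    }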
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
new file mode 100644
index 0000000..c929f37
--- /dev/null
+++ b/string/aarch64/strcpy-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
+#ifdef BUILD_STPCPY
+#define FUNC __stpcpy_aarch64_sve
+#else
+#define FUNC __strcpy_aarch64_sve
+#endif
+
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
+ setffr /* initialize FFR */
+ ptrue p2.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x1, x2]
+ rdffrs p0.b, p2/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the whole vector and loop. */
+ st1b z0.b, p2, [x0, x2]
+ incb x2, all
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the valid portion of the vector and loop. */
+ setffr /* re-init FFR */
+ st1b z0.b, p0, [x0, x2]
+ incp x2, p0.b
+ b 0b
+
+ /* Zero found. Crop the vector to the found zero and finish. */
+2: brka p0.b, p2/z, p1.b
+ st1b z0.b, p0, [x0, x2]
+#ifdef BUILD_STPCPY
+ add x0, x0, x2
+ sub x0, x0, 1
+ incp x0, p0.b
+#endif
+ ret
+
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
new file mode 100644
index 0000000..4e10b4d
--- /dev/null
+++ b/string/aarch64/strcpy.S
@@ -0,0 +1,314 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2013-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY __stpcpy_aarch64
+#else
+#define STRCPY __strcpy_aarch64
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+def_fn STRCPY p2align=6
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt .Lpage_cross
+
+.Lpage_cross_ok:
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq .Lbulk_entry
+
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+.Lfp_gt8:
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+.Lfp_le8:
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt .Lfp_lt4
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+.Lfp_lt4:
+ cbz pos, .Lfp_lt2
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+.Lfp_lt2:
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+ /* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+.Lbulk_entry:
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b .Lentry_no_page_cross
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+.Lmain_loop:
+ stp data1, data2, [dst], #16
+.Lentry_no_page_cross:
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lmain_loop
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+.Lpage_cross:
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lpage_cross_ok
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, .Lfp_le8
+ bic has_nul2, tmp3, tmp4
+ b .Lfp_gt8
+
+ .size STRCPY, . - STRCPY
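
Editorial note, not part of the patch: the tail handling above follows the comment in the code: once at least 16 bytes have been copied, the cheapest way to finish is to locate the trailing NUL and re-copy the 16 bytes that end at it, overlapping bytes already written. A minimal C sketch of that idea (the function name and parameters are illustrative only):

#include <stddef.h>
#include <string.h>

/* nul_off is the offset of the terminating NUL in src; assumes
   nul_off >= 15 and that the first bytes were already copied. */
static void
copy_tail_sketch (char *dst, const char *src, size_t nul_off)
{
  /* Re-copy the 16 bytes ending at the NUL, including the NUL itself. */
  memcpy (dst + nul_off - 15, src + nul_off - 15, 16);
}
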
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
new file mode 100644
index 0000000..64ede85
--- /dev/null
+++ b/string/aarch64/strlen-sve.S
@@ -0,0 +1,55 @@
+/*
+ * __strlen_aarch64_sve - compute the length of a string
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strlen_aarch64_sve
+ .type __strlen_aarch64_sve, %function
+ .p2align 4
+__strlen_aarch64_sve:
+ setffr /* initialize FFR */
+ ptrue p2.b /* all ones; loop invariant */
+ mov x1, 0 /* initialize length */
+ nop
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x0, x1]
+ nop
+ rdffrs p0.b, p2/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x1, all /* speculate increment */
+ cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */
+ b.none 0b
+ decb x1, all /* undo speculate */
+
+ /* Zero found. Select the bytes before the first and count them. */
+1: brkb p0.b, p2/z, p1.b
+ incp x1, p0.b
+ mov x0, x1
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p1.b, p0/z, z0.b, 0
+ b.any 1b
+
+ /* No zero found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x1, p0.b
+ b 0b
+
+ .size __strlen_aarch64_sve, . - __strlen_aarch64_sve
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
new file mode 100644
index 0000000..26388d7
--- /dev/null
+++ b/string/aarch64/strlen.S
@@ -0,0 +1,214 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+
+/* Locals and temporaries. */
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ __strlen_aarch64 will be repeatedly called on strings with the same length).
+
+   If the string is longer than 16 bytes, we align src so we don't need
+   further page cross checks, and process 32 bytes per iteration
+   using the fast NUL check.  If we encounter non-ASCII characters,
+   we fall back to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
+def_fn __strlen_aarch64 p2align=6
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+.Lpage_cross_entry:
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
+#endif
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
+
+ .size __strlen_aarch64, . - __strlen_aarch64
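
Editorial note, not part of the patch: the NUL-detection principle described in the strlen.S comments above is easy to check in C. A sketch (the function name is illustrative):

#include <stdint.h>

/* (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff some byte of x is
   zero; ~(x | 0x7f..7f) equals ~x & 0x80..80, so this is the classic
   (X - 1) & ~X & 0x80 test applied to all eight bytes at once. */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  const uint64_t rep8_01 = 0x0101010101010101ULL;
  const uint64_t rep8_7f = 0x7f7f7f7f7f7f7f7fULL;
  return (x - rep8_01) & ~(x | rep8_7f);
}
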
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
new file mode 100644
index 0000000..6f31eca
--- /dev/null
+++ b/string/aarch64/strncmp-sve.S
@@ -0,0 +1,66 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strncmp_aarch64_sve
+ .type __strncmp_aarch64_sve, %function
+ .p2align 4
+__strncmp_aarch64_sve:
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x3]
+ ldff1b z1.b, p0/z, [x1, x3]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+ cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */
+ cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p0/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p1/z, z0.b, 0
+ nands p2.b, p1/z, p2.b, p3.b
+ b.any 1b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x3, p1.b
+ b 0b
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equal */
+ ret
+
+ .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
new file mode 100644
index 0000000..ced72b9
--- /dev/null
+++ b/string/aarch64/strncmp.S
@@ -0,0 +1,266 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+#define count mask
+
+ .text
+ .p2align 6
+ .rep 7
+ nop /* Pad so that the loop below fits a cache line. */
+ .endr
+def_fn __strncmp_aarch64
+ cbz limit, .Lret0
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne .Lmisaligned8
+ cbnz count, .Lmutual_align
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* Start of performance-critical section -- one 64B cache line. */
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq .Lloop_aligned
+ /* End of performance-critical section -- one 64B cache line. */
+
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, .Lnot_limit
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq .Lnot_limit
+
+ lsl limit, limit, #3 /* Bits -> bytes. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+.Lnot_limit:
+ orr syndrome, diff, has_nul
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+.Lmutual_align:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
+ b .Lstart_realigned
+
+ .p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
+.Lmisaligned8:
+ cmp limit, #16
+ b.hs .Ltry_misaligned_words
+
+.Lbyte_loop:
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq .Lbyte_loop
+.Ldone:
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+.Ltry_misaligned_words:
+ lsr limit_wd, limit, #3
+ cbz count, .Ldo_misaligned
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+.Lpage_end_loop:
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne .Ldone
+ subs count, count, #1
+ b.hi .Lpage_end_loop
+
+.Ldo_misaligned:
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo .Ldone_loop
+.Lloop_misaligned:
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, .Lpage_end_loop
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+ subs limit_wd, limit_wd, #1
+ b.pl .Lloop_misaligned
+
+.Ldone_loop:
+	/* Limit reached in the dword loop with no difference or NUL found.  */
+ and limit, limit, #7
+ cbz limit, .Lnot_limit
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+
+.Lret0:
+ mov result, #0
+ ret
+ .size __strncmp_aarch64, . - __strncmp_aarch64
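
Editorial note, not part of the patch: the little-endian epilogue above turns the combined syndrome (diff | has_nul) into a return value by byte-reversing, counting leading zeros, shifting, and subtracting the top bytes. A rough C equivalent, with GCC/Clang builtins standing in for the REV and CLZ instructions (names are illustrative):

#include <stdint.h>

/* Assumes syndrome != 0 and little-endian data words. */
static int
result_from_syndrome (uint64_t data1, uint64_t data2, uint64_t syndrome)
{
  int pos = __builtin_clzll (__builtin_bswap64 (syndrome));
  data1 = __builtin_bswap64 (data1) << pos;   /* first difference/NUL now at the top */
  data2 = __builtin_bswap64 (data2) << pos;
  return (int) (data1 >> 56) - (int) (data2 >> 56);
}
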
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
new file mode 100644
index 0000000..3a9be08
--- /dev/null
+++ b/string/aarch64/strnlen-sve.S
@@ -0,0 +1,72 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strnlen_aarch64_sve
+ .type __strnlen_aarch64_sve, %function
+ .p2align 4
+__strnlen_aarch64_sve:
+ setffr /* initialize FFR */
+ mov x2, 0 /* initialize len */
+ b 1f
+
+ .p2align 4
+ /* We have off + vl <= max, and so may read the whole vector. */
+0: ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p2.b, p0/z, z0.b, 0
+ b.any 8f
+ incb x2
+
+1: whilelo p0.b, x2, x1
+ b.last 0b
+
+	/* We have off + vl > max.  Test for off == max before proceeding. */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Compare for end-of-string, but there are no more bytes. */
+ cmpeq p2.b, p0/z, z0.b, 0
+
+ /* Found end-of-string or zero. */
+8: brkb p2.b, p0/z, p2.b
+ mov x0, x2
+ incp x0, p2.b
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, 0
+ b.any 8b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x2, p1.b
+ b 1b
+
+ /* End of count. Return max. */
+9: mov x0, x2
+ ret
+
+ .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
new file mode 100644
index 0000000..b02c846
--- /dev/null
+++ b/string/aarch64/strnlen.S
@@ -0,0 +1,160 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+#define limit x1
+
+/* Locals and temporaries. */
+#define src x2
+#define data1 x3
+#define data2 x4
+#define data2a x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define pos x13
+#define limit_wd x14
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ .text
+ .p2align 6
+.Lstart:
+	/* Pre-pad to ensure the critical loop starts on an icache line boundary. */
+ .rep 7
+ nop
+ .endr
+ /* Put this code here to avoid wasting more space with pre-padding. */
+.Lhit_limit:
+ mov len, limit
+ ret
+
+def_fn __strnlen_aarch64
+ cbz limit, .Lhit_limit
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne .Lmisaligned
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+.Lloop:
+ ldp data1, data2, [src], #16
+.Lrealigned:
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ subs limit_wd, limit_wd, #1
+ orr tmp1, has_nul1, has_nul2
+ ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
+ b.eq .Lloop
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ orr tmp1, has_nul1, has_nul2
+ cbz tmp1, .Lhit_limit /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+ sub len, src, srcin
+ cbz has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
+#endif
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
+ ret
+
+.Lmisaligned:
+ /* Deal with a partial first word.
+	   We're doing two things in parallel here:
+ 1) Calculate the number of words (but avoiding overflow if
+ limit is near ULONG_MAX) - to do this we need to work out
+ limit + tmp1 - 1 as a 65-bit value before shifting it;
+ 2) Load and mask the initial data words - we force the bytes
+ before the ones we are interested in to 0xff - this ensures
+ early bytes will not hit any zero detection. */
+ sub limit_wd, limit, #1
+ neg tmp4, tmp1
+ cmp tmp1, #8
+
+ and tmp3, limit_wd, #15
+ lsr limit_wd, limit_wd, #4
+ mov tmp2, #~0
+
+ ldp data1, data2, [src], #16
+ lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
+ add tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit_wd, tmp3, lsr #4
+
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b .Lrealigned
+ .size __strnlen_aarch64, . - .Lstart /* Include pre-padding in size. */
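
Editorial note, not part of the patch: the .Lmisaligned comment above describes computing limit + tmp1 - 1 as a 65-bit value before shifting, so that a limit near ULONG_MAX cannot overflow. The same split written in C looks roughly like this (names are illustrative):

#include <stdint.h>

/* Returns (limit + misalign - 1) / 16 for limit != 0 and misalign < 16,
   without ever forming the full sum. */
static uint64_t
qwords_to_check (uint64_t limit, uint64_t misalign)
{
  uint64_t limit_wd = (limit - 1) >> 4;
  uint64_t low = ((limit - 1) & 15) + misalign;   /* at most 15 + 15 */
  return limit_wd + (low >> 4);
}
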
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
new file mode 100644
index 0000000..bb522e7
--- /dev/null
+++ b/string/aarch64/strrchr-sve.S
@@ -0,0 +1,83 @@
+/*
+ * strrchr - find the last occurrence of a character in a string
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ .arch armv8-a+sve
+ .text
+
+ .globl __strrchr_aarch64_sve
+ .type __strrchr_aarch64_sve, %function
+ .p2align 4
+__strrchr_aarch64_sve:
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+ mov x2, 0 /* no match found so far */
+ pfalse p2.b
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0, all /* skip bytes this round */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ b.any 3f
+
+ cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */
+ b.none 0b
+
+ mov x2, x0 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparisons only on the valid bytes.  */
+1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ b.any 2f
+
+ cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */
+ mov x3, x0
+ incp x0, p0.b /* skip bytes this round */
+ setffr /* re-init FFR */
+ b.none 0b
+
+ addvl x2, x3, 1 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* Found end-of-string. */
+2: incb x0, all /* advance base */
+3: brka p3.b, p1/z, p3.b /* mask after first 0 */
+ cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */
+ b.any 4f
+
+ /* No C within last vector. Did we have one before? */
+ cbz x2, 5f
+ mov x0, x2 /* restore advanced base */
+ mov p3.b, p2.b /* restore saved search */
+
+ /* Find the *last* match in the predicate. This is slightly
+ more complicated than finding the first match. */
+4: rev p3.b, p3.b /* reverse the bits */
+ brka p3.b, p1/z, p3.b /* find position of last match */
+ decp x0, p3.b /* retard pointer to last match */
+ ret
+
+ /* No C whatsoever. Return NULL. */
+5: mov x0, 0
+ ret
+
+ .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
new file mode 100644
index 0000000..2eff4d1
--- /dev/null
+++ b/string/arm/memchr.S
@@ -0,0 +1,133 @@
+/*
+ * memchr - scan memory for a character
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This __memchr_arm routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.  It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memchr_arm
+ .type __memchr_arm,%function
+__memchr_arm:
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldmia r0!,{r5,r6}
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+
+ .size __memchr_arm, . - __memchr_arm
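
Editorial note, not part of the patch: the fast path above replicates the match byte across a word, XORs it with the data so that matching bytes become zero, and then detects zero bytes in parallel (the UADD8/SEL pair yields the 00/ff per-byte mask directly). A portable C sketch of the same test, with an illustrative function name:

#include <stdint.h>

/* Returns a word with 0x80 set in each byte position of word that
   equals c; non-zero means at least one match in these four bytes. */
static uint32_t
bytes_matching (uint32_t word, unsigned char c)
{
  uint32_t rep = c * 0x01010101u;   /* replicate c across the word */
  uint32_t x = word ^ rep;          /* matching bytes become zero */
  return (x - 0x01010101u) & ~x & 0x80808080u;
}
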
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
new file mode 100644
index 0000000..3346e4f
--- /dev/null
+++ b/string/arm/memcpy.S
@@ -0,0 +1,593 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2013, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn __memcpy_arm p2align=6
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne .Ltail63unaligned
+ bx lr
+
+ .size __memcpy_arm, . - __memcpy_arm
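
Editorial note, not part of the patch: the bulk-copy loops above are software-pipelined ("SMS style"): each iteration stores the data loaded by the previous one, so loads run well ahead of stores and memory latency is hidden. Reduced to plain 64-bit words, the pattern looks roughly like this (illustrative only; assumes nwords >= 1 and aligned, non-overlapping buffers):

#include <stddef.h>
#include <stdint.h>

static void
copy_pipelined (uint64_t *dst, const uint64_t *src, size_t nwords)
{
  uint64_t cur = *src++;            /* prime the pipeline */
  for (size_t i = 1; i < nwords; i++)
    {
      uint64_t next = *src++;       /* load for the next iteration */
      *dst++ = cur;                 /* store what the previous one loaded */
      cur = next;
    }
  *dst = cur;                       /* drain the pipeline */
}
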
diff --git a/string/arm/memset.S b/string/arm/memset.S
new file mode 100644
index 0000000..3ee5238
--- /dev/null
+++ b/string/arm/memset.S
@@ -0,0 +1,99 @@
+/*
+ * memset - fill memory with a constant
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memset routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.
+
+ */
+
+ .syntax unified
+ .arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@ Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memset_arm
+ .type __memset_arm,%function
+__memset_arm:
+ @ r0 = address
+ @ r1 = character
+ @ r2 = count
+ @ returns original address in r0
+
+ mov r3, r0 @ Leave r0 alone
+ cbz r2, 10f @ Exit if 0 length
+
+ tst r0, #7
+ beq 2f @ Already aligned
+
+ @ Ok, so we're misaligned here
+1:
+ strb r1, [r3], #1
+ subs r2,r2,#1
+ tst r3, #7
+ cbz r2, 10f @ Exit if we hit the end
+ bne 1b @ go round again if still misaligned
+
+2:
+ @ OK, so we're aligned
+ push {r4,r5,r6,r7}
+ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
+ beq 5f
+
+3:
+ @ POSIX says that ch is cast to an unsigned char. A uxtb is one
+ @ byte and takes two cycles, where an AND is four bytes but one
+ @ cycle.
+ and r1, #0xFF
+ orr r1, r1, r1, lsl#8 @ Same character into all bytes
+ orr r1, r1, r1, lsl#16
+ mov r5,r1
+ mov r6,r1
+ mov r7,r1
+
+4:
+ subs r4,r4,#16
+ stmia r3!,{r1,r5,r6,r7}
+ bne 4b
+ and r2,r2,#15
+
+	@ At this point we're still aligned and we have up to align-1 bytes left to write;
+	@ we can avoid some of the byte-at-a-time work now by testing for some big chunks
+ tst r2,#8
+ itt ne
+ subne r2,r2,#8
+ stmiane r3!,{r1,r5}
+
+5:
+ pop {r4,r5,r6,r7}
+ cbz r2, 10f
+
+ @ Got to do any last < alignment bytes
+6:
+ subs r2,r2,#1
+ strb r1,[r3],#1
+ bne 6b
+
+10:
+ bx lr @ goodbye
+ .size __memset_arm, . - __memset_arm
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
new file mode 100644
index 0000000..5ea06c9
--- /dev/null
+++ b/string/arm/strcmp-armv6m.S
@@ -0,0 +1,118 @@
+/*
+ * strcmp for ARMv6-M (optimized for performance, not size)
+ *
+ * Copyright (c) 2014-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+ .thumb_func
+ .syntax unified
+ .arch armv6-m
+
+ .macro DoSub n, label
+ subs r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+ lsrs r1, r4, \n
+#else
+ lsls r1, r4, \n
+#endif
+ orrs r1, r0
+ bne \label
+ .endm
+
+ .macro Byte_Test n, label
+ lsrs r0, r2, \n
+ lsrs r1, r3, \n
+ DoSub \n, \label
+ .endm
+
+ .text
+ .p2align 0
+ .global __strcmp_armv6m
+ .type __strcmp_armv6m, %function
+__strcmp_armv6m:
+ .cfi_startproc
+ mov r2, r0
+ push {r4, r5, r6, lr}
+ orrs r2, r1
+ lsls r2, r2, #30
+ bne 6f
+ ldr r5, =0x01010101
+ lsls r6, r5, #7
+1:
+ ldmia r0!, {r2}
+ ldmia r1!, {r3}
+ subs r4, r2, r5
+ bics r4, r2
+ ands r4, r6
+ beq 3f
+
+#ifdef __ARM_BIG_ENDIAN
+ Byte_Test #24, 4f
+ Byte_Test #16, 4f
+ Byte_Test #8, 4f
+
+ b 7f
+3:
+ cmp r2, r3
+ beq 1b
+ cmp r2, r3
+#else
+ uxtb r0, r2
+ uxtb r1, r3
+ DoSub #24, 2f
+
+ uxth r0, r2
+ uxth r1, r3
+ DoSub #16, 2f
+
+ lsls r0, r2, #8
+ lsls r1, r3, #8
+ lsrs r0, r0, #8
+ lsrs r1, r1, #8
+ DoSub #8, 2f
+
+ lsrs r0, r2, #24
+ lsrs r1, r3, #24
+ subs r0, r0, r1
+2:
+ pop {r4, r5, r6, pc}
+
+3:
+ cmp r2, r3
+ beq 1b
+ rev r0, r2
+ rev r1, r3
+ cmp r0, r1
+#endif
+
+ bls 5f
+ movs r0, #1
+4:
+ pop {r4, r5, r6, pc}
+5:
+ movs r0, #0
+ mvns r0, r0
+ pop {r4, r5, r6, pc}
+6:
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ bne 7f
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ beq 6b
+7:
+ subs r0, r2, r3
+ pop {r4, r5, r6, pc}
+ .cfi_endproc
+ .size __strcmp_armv6m, . - __strcmp_armv6m
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
new file mode 100644
index 0000000..fb9cae3
--- /dev/null
+++ b/string/arm/strcmp.S
@@ -0,0 +1,479 @@
+/*
+ * strcmp for ARMv7
+ *
+ * Copyright (c) 2012-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+ available. Use ldrd to support wider loads, provided the data
+ is sufficiently aligned. Use saturating arithmetic to optimize
+ the compares. */
+
+/* Build Options:
+ STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+ byte in the string. If comparing completely random strings
+ the pre-check will save time, since there is a very high
+ probability of a mismatch in the first character: we save
+ significant overhead if this is the common case. However,
+ if strings are likely to be identical (eg because we're
+ verifying a hit in a hash table), then this check is largely
+ redundant. */
+
+#define STRCMP_NO_PRECHECK 0
+
+ /* This version uses Thumb-2 code. */
+ .thumb
+ .syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define src1 r0
+#define src2 r1
+#define result r0 /* Overlaps src1. */
+
+/* Internal variables. */
+#define tmp1 r4
+#define tmp2 r5
+#define const_m1 r12
+
+/* Additional internal variables for 64-bit aligned data. */
+#define data1a r2
+#define data1b r3
+#define data2a r6
+#define data2b r7
+#define syndrome_a tmp1
+#define syndrome_b tmp2
+
+/* Additional internal variables for 32-bit aligned data. */
+#define data1 r2
+#define data2 r3
+#define syndrome tmp2
+
+
+ /* Macro to compute and return the result value for word-aligned
+ cases. */
+ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+ /* If data1 contains a zero byte, then syndrome will contain a 1 in
+ bit 7 of that byte. Otherwise, the highest set bit in the
+ syndrome will highlight the first different bit. It is therefore
+ sufficient to extract the eight bits starting with the syndrome
+ bit. */
+ clz tmp1, \synd
+ lsl r1, \d2, tmp1
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsl \d1, \d1, tmp1
+ .cfi_remember_state
+ lsr result, \d1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1, lsr #24
+ bx lr
+#else
+ /* To use the big-endian trick we'd have to reverse all three words.
+	   That's slower than this approach.  */
+ rev \synd, \synd
+ clz tmp1, \synd
+ bic tmp1, tmp1, #7
+ lsr r1, \d2, tmp1
+ .cfi_remember_state
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsr \d1, \d1, tmp1
+ and result, \d1, #255
+ and r1, r1, #255
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1
+
+ bx lr
+#endif
+ .endm
+
+ .text
+ .p2align 5
+.Lstrcmp_start_addr:
+#if STRCMP_NO_PRECHECK == 0
+.Lfastpath_exit:
+ sub r0, r2, r3
+ bx lr
+ nop
+#endif
+def_fn __strcmp_arm
+#if STRCMP_NO_PRECHECK == 0
+ ldrb r2, [src1]
+ ldrb r3, [src2]
+ cmp r2, #1
+ it cs
+ cmpcs r2, r3
+ bne .Lfastpath_exit
+#endif
+ .cfi_startproc
+ strd r4, r5, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ .cfi_offset 4, -16
+ .cfi_offset 5, -12
+ orr tmp1, src1, src2
+ strd r6, r7, [sp, #8]
+ .cfi_offset 6, -8
+ .cfi_offset 7, -4
+ mvn const_m1, #0
+ lsl r2, tmp1, #29
+ cbz r2, .Lloop_aligned8
+
+.Lnot_aligned:
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ bne .Lmisaligned8
+
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ and tmp1, src1, #7
+ bic src1, src1, #7
+ and tmp2, tmp1, #3
+ bic src2, src2, #7
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ ldrd data1a, data1b, [src1], #16
+ tst tmp1, #4
+ ldrd data2a, data2b, [src2], #16
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp2
+ orn data1a, data1a, tmp1
+ orn data2a, data2a, tmp1
+ beq .Lstart_realigned8
+ orn data1b, data1b, tmp1
+ mov data1a, const_m1
+ orn data2b, data2b, tmp1
+ mov data2a, const_m1
+ b .Lstart_realigned8
+
+ /* Unwind the inner loop by a factor of 2, giving 16 bytes per
+ pass. */
+ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
+ .p2align 2 /* Always word aligned. */
+.Lloop_aligned8:
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
+.Lstart_realigned8:
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ cbnz syndrome_a, .Ldiff_in_a
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ cbnz syndrome_b, .Ldiff_in_b
+
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ /* Can't use CBZ for backwards branch. */
+ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+ beq .Lloop_aligned8
+
+.Ldiff_found:
+ cbnz syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+ strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+ .cfi_restore_state
+ strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+ .cfi_restore_state
+.Lmisaligned8:
+ tst tmp1, #3
+ bne .Lmisaligned4
+ ands tmp1, src1, #3
+ bne .Lmutual_align4
+
+ /* Unrolled by a factor of 2, to reduce the number of post-increment
+ operations. */
+.Lloop_aligned4:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned4:
+ uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cbnz syndrome, .Laligned4_done
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
+ uadd8 syndrome, data1, const_m1
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cmp syndrome, #0
+ beq .Lloop_aligned4
+
+.Laligned4_done:
+ strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+ .cfi_restore_state
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ lsl tmp1, tmp1, #3 /* Bytes -> bits. */
+ bic src1, src1, #3
+ ldr data1, [src1], #8
+ bic src2, src2, #3
+ ldr data2, [src2], #8
+
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp1
+ orn data1, data1, tmp1
+ orn data2, data2, tmp1
+ b .Lstart_realigned4
+
+.Lmisaligned4:
+ ands tmp1, src1, #3
+ beq .Lsrc1_aligned
+ sub src2, src2, tmp1
+ bic src1, src1, #3
+ lsls tmp1, tmp1, #31
+ ldr data1, [src1], #4
+ beq .Laligned_m2
+ bcs .Laligned_m1
+
+#if STRCMP_NO_PRECHECK == 1
+ ldrb data2, [src2, #1]
+ uxtb tmp1, data1, ror #BYTE1_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m1:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ add src2, src2, #4
+ cbnz data2, .Lsrc1_aligned
+#else /* STRCMP_NO_PRECHECK */
+ /* If we've done the pre-check, then we don't need to check the
+ first byte again here. */
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbnz data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+ .cfi_remember_state
+ mov result, tmp1
+ ldr r4, [sp], #16
+ .cfi_restore 4
+ bx lr
+
+#if STRCMP_NO_PRECHECK == 0
+.Laligned_m1:
+ add src2, src2, #4
+#endif
+.Lsrc1_aligned:
+ .cfi_restore_state
+ /* src1 is word aligned, but src2 has no common alignment
+ with it. */
+ ldr data1, [src1], #4
+ lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
+
+ bic src2, src2, #3
+ ldr data2, [src2], #4
+ bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
+
+ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
+.Loverlap3:
+ bic tmp1, data1, #MSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #8
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #24
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap3
+4:
+ S2LO data2, data2, #8
+ b .Lstrcmp_tail
+
+5:
+ bics syndrome, syndrome, #MSB
+ bne .Lstrcmp_done_equal
+
+ /* We can only get here if the MSB of data1 contains 0, so
+ fast-path the exit. */
+ ldrb result, [src2]
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 Not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ neg result, result
+ bx lr
+
+6:
+ .cfi_restore_state
+ S2LO data1, data1, #24
+ and data2, data2, #LSB
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap2:
+ and tmp1, data1, const_m1, S2LO #16
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #16
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #16
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap2
+4:
+ S2LO data2, data2, #16
+ b .Lstrcmp_tail
+5:
+ ands syndrome, syndrome, const_m1, S2LO #16
+ bne .Lstrcmp_done_equal
+
+ ldrh data2, [src2]
+ S2LO data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+ lsl data2, data2, #16
+#endif
+ b .Lstrcmp_tail
+
+6:
+ S2LO data1, data1, #16
+ and data2, data2, const_m1, S2LO #16
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap1:
+ and tmp1, data1, #LSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #24
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #8
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap1
+4:
+ S2LO data2, data2, #24
+ b .Lstrcmp_tail
+5:
+ tst syndrome, #LSB
+ bne .Lstrcmp_done_equal
+ ldr data2, [src2]
+6:
+ S2LO data1, data1, #8
+ bic data2, data2, #MSB
+ b .Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+ mov result, #0
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ bx lr
+
+.Lstrcmp_tail:
+ .cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+ rev data1, data1
+ rev data2, data2
+ /* Now everything looks big-endian... */
+#endif
+ uadd8 tmp1, data1, const_m1
+ eor tmp1, data1, data2
+ sel syndrome, tmp1, const_m1
+ clz tmp1, syndrome
+ lsl data1, data1, tmp1
+ lsl data2, data2, tmp1
+ lsr result, data1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ sub result, result, data2, lsr #24
+ bx lr
+ .cfi_endproc
+ .size __strcmp, . - .Lstrcmp_start_addr
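Note: the Thumb-2 strcmp above compares whole words at a time and uses UADD8/SEL to build a "syndrome" that is non-zero at the first byte which is either a NUL in data1 or a difference between data1 and data2. The C sketch below is illustrative only (not part of the library; the helper names are made up). Like the assembly, it assumes word-aligned pointers, so the word-sized over-read stays within the aligned word containing the terminator.

#include <stdint.h>
#include <string.h>

/* Non-zero iff the 32-bit word w contains a zero byte: the classic
   (w - 0x01010101) & ~w & 0x80808080 test. */
uint32_t has_zero_byte (uint32_t w)
{
  return (w - 0x01010101u) & ~w & 0x80808080u;
}

/* Word-at-a-time comparison sketch; both pointers assumed word aligned. */
int strcmp_words_sketch (const char *s1, const char *s2)
{
  for (;;)
    {
      uint32_t w1, w2;
      memcpy (&w1, s1, 4);
      memcpy (&w2, s2, 4);
      /* A non-zero "syndrome" means a differing byte or a NUL in w1. */
      if ((w1 ^ w2) != 0 || has_zero_byte (w1))
        break;
      s1 += 4;
      s2 += 4;
    }
  /* Resolve the result byte by byte within the final word. */
  const unsigned char *u1 = (const unsigned char *) s1;
  const unsigned char *u2 = (const unsigned char *) s2;
  while (*u1 != 0 && *u1 == *u2)
    u1++, u2++;
  return *u1 - *u2;
}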
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
new file mode 100644
index 0000000..48ebbe8
--- /dev/null
+++ b/string/arm/strcpy.c
@@ -0,0 +1,129 @@
+/*
+ * strcpy
+ *
+ * Copyright (c) 2008-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
+
+char* __attribute__((naked))
+__strcpy_arm (char* dst, const char* src)
+{
+ __asm__ (
+ "pld [r1, #0]\n\t"
+ "eor r2, r0, r1\n\t"
+ "mov ip, r0\n\t"
+ "tst r2, #3\n\t"
+ "bne 4f\n\t"
+ "tst r1, #3\n\t"
+ "bne 3f\n"
+ "5:\n\t"
+# ifndef __thumb2__
+ "str r5, [sp, #-4]!\n\t"
+ "mov r5, #0x01\n\t"
+ "orr r5, r5, r5, lsl #8\n\t"
+ "orr r5, r5, r5, lsl #16\n\t"
+# endif
+
+ "str r4, [sp, #-4]!\n\t"
+ "tst r1, #4\n\t"
+ "ldr r3, [r1], #4\n\t"
+ "beq 2f\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "streq r3, [ip], #4\n\t"
+ "ldreq r3, [r1], #4\n"
+ "bne 1f\n\t"
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ ".p2align 2\n"
+ "2:\n\t"
+ "pld [r1, #8]\n\t"
+ "ldr r4, [r1], #4\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "sub r2, r4, "magic1(r5)"\n\t"
+ "bne 1f\n\t"
+ "str r3, [ip], #4\n\t"
+ "bics r2, r2, r4\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "ldreq r3, [r1], #4\n\t"
+ "streq r4, [ip], #4\n\t"
+ "beq 2b\n\t"
+ "mov r3, r4\n"
+ "1:\n\t"
+# ifdef __ARMEB__
+ "rors r3, r3, #24\n\t"
+# endif
+ "strb r3, [ip], #1\n\t"
+ "tst r3, #0xff\n\t"
+# ifdef __ARMEL__
+ "ror r3, r3, #8\n\t"
+# endif
+ "bne 1b\n\t"
+ "ldr r4, [sp], #4\n\t"
+# ifndef __thumb2__
+ "ldr r5, [sp], #4\n\t"
+# endif
+ "BX LR\n"
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+ "3:\n\t"
+ "tst r1, #1\n\t"
+ "beq 1f\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "it eq\n"
+ "BXEQ LR\n"
+ "1:\n\t"
+ "tst r1, #2\n\t"
+ "beq 5b\n\t"
+ "ldrh r2, [r1], #2\n\t"
+# ifdef __ARMEB__
+ "tst r2, #0xff00\n\t"
+ "iteet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "lsreq r2, r2, #8\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff\n\t"
+# else
+ "tst r2, #0xff\n\t"
+ "itet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff00\n\t"
+# endif
+ "bne 5b\n\t"
+ "BX LR\n"
+
+ /* src and dst do not have a common word-alignment. Fall back to
+ byte copying. */
+ "4:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 4b\n\t"
+ "BX LR");
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
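Note: the inline-assembly strcpy above detects the terminating NUL a word at a time using the 0x01010101 / 0x80808080 "magic" constants (the sub/bics/tst sequence). A rough portable equivalent is sketched below; the function name is made up, and unlike the assembly it only aligns the source, relying on memcpy for the possibly unaligned stores.

#include <stdint.h>
#include <string.h>

char *strcpy_words_sketch (char *dst, const char *src)
{
  char *d = dst;
  /* Copy byte by byte until src is word aligned. */
  while (((uintptr_t) src & 3) != 0)
    if ((*d++ = *src++) == '\0')
      return dst;
  for (;;)
    {
      uint32_t w;
      memcpy (&w, src, 4);
      /* Same test as the magic1/magic2 constants above: non-zero iff
         the word contains a zero byte, so stop before storing it. */
      if ((w - 0x01010101u) & ~w & 0x80808080u)
        break;
      memcpy (d, &w, 4);
      d += 4;
      src += 4;
    }
  /* Finish byte by byte, copying the terminating NUL. */
  while ((*d++ = *src++) != '\0')
    ;
  return dst;
}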
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
new file mode 100644
index 0000000..279ec87
--- /dev/null
+++ b/string/arm/strlen-armv6t2.S
@@ -0,0 +1,125 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2010, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Assumes:
+ ARMv6T2, AArch32
+
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#ifdef __ARMEB__
+#define S2LO lsl
+#define S2HI lsr
+#else
+#define S2LO lsr
+#define S2HI lsl
+#endif
+
+ /* This code requires Thumb. */
+ .thumb
+ .syntax unified
+
+/* Parameters and result. */
+#define srcin r0
+#define result r0
+
+/* Internal variables. */
+#define src r1
+#define data1a r2
+#define data1b r3
+#define const_m1 r12
+#define const_0 r4
+#define tmp1 r4 /* Overlaps const_0 */
+#define tmp2 r5
+
+def_fn __strlen_armv6t2 p2align=6
+ pld [srcin, #0]
+ strd r4, r5, [sp, #-8]!
+ bic src, srcin, #7
+ mvn const_m1, #0
+ ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
+ pld [src, #32]
+ bne.w .Lmisaligned8
+ mov const_0, #0
+ mov result, #-8
+.Lloop_aligned:
+ /* Bytes 0-7. */
+ ldrd data1a, data1b, [src]
+ pld [src, #64]
+ add result, result, #8
+.Lstart_realigned:
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 8-15. */
+ ldrd data1a, data1b, [src, #8]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 16-23. */
+ ldrd data1a, data1b, [src, #16]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 24-31. */
+ ldrd data1a, data1b, [src, #24]
+ add src, src, #32
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cmp data1b, #0
+ beq .Lloop_aligned
+
+.Lnull_found:
+ cmp data1a, #0
+ itt eq
+ addeq result, result, #4
+ moveq data1a, data1b
+#ifndef __ARMEB__
+ rev data1a, data1a
+#endif
+ clz data1a, data1a
+ ldrd r4, r5, [sp], #8
+ add result, result, data1a, lsr #3 /* Bits -> Bytes. */
+ bx lr
+
+.Lmisaligned8:
+ ldrd data1a, data1b, [src]
+ and tmp2, tmp1, #3
+ rsb result, tmp1, #0
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ tst tmp1, #4
+ pld [src, #64]
+ S2HI tmp2, const_m1, tmp2
+ orn data1a, data1a, tmp2
+ itt ne
+ ornne data1b, data1b, tmp2
+ movne data1a, const_m1
+ mov const_0, #0
+ b .Lstart_realigned
+ .size __strlen_armv6t2, . - __strlen_armv6t2
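Note: __strlen_armv6t2 above scans the string in aligned words, using UADD8/SEL against const_m1 to detect a zero byte and then REV/CLZ to locate it within the word. The sketch below is illustrative, not the library code: it uses the equivalent magic-constant test and a plain byte scan for the final word.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

size_t strlen_words_sketch (const char *s)
{
  const char *p = s;
  /* Byte scan until p is word aligned. */
  while (((uintptr_t) p & 3) != 0)
    {
      if (*p == '\0')
        return (size_t) (p - s);
      p++;
    }
  /* Word scan: non-zero iff the word contains a zero byte.  The assembly
     derives the same answer from the GE flags and then uses REV + CLZ to
     find the exact byte in constant time. */
  for (;;)
    {
      uint32_t w;
      memcpy (&w, p, 4);
      if ((w - 0x01010101u) & ~w & 0x80808080u)
        break;
      p += 4;
    }
  while (*p != '\0')
    p++;
  return (size_t) (p - s);
}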
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 753d06a..96647cf 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -14,4 +14,36 @@
#if __aarch64__
void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t);
+void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64 (void *, int, size_t);
+void *__memchr_aarch64 (const void *, int, size_t);
+int __memcmp_aarch64 (const void *, const void *, size_t);
+char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+int __strcmp_aarch64 (const char *, const char *);
+char *__strchr_aarch64 (const char *, int);
+char *__strchrnul_aarch64 (const char *, int);
+size_t __strlen_aarch64 (const char *);
+size_t __strnlen_aarch64 (const char *, size_t);
+int __strncmp_aarch64 (const char *, const char *, size_t);
+# if __ARM_FEATURE_SVE
+void *__memchr_aarch64_sve (const void *, int, size_t);
+int __memcmp_aarch64_sve (const void *, const void *, size_t);
+char *__strchr_aarch64_sve (const char *, int);
+char *__strrchr_aarch64_sve (const char *, int);
+char *__strchrnul_aarch64_sve (const char *, int);
+int __strcmp_aarch64_sve (const char *, const char *);
+char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+size_t __strlen_aarch64_sve (const char *);
+size_t __strnlen_aarch64_sve (const char *, size_t);
+int __strncmp_aarch64_sve (const char *, const char *, size_t);
+# endif
+#elif __arm__
+void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
+void *__memset_arm (void *, int, size_t);
+void *__memchr_arm (const void *, int, size_t);
+char *__strcpy_arm (char *__restrict, const char *__restrict);
+int __strcmp_arm (const char *, const char *);
+int __strcmp_armv6m (const char *, const char *);
+size_t __strlen_armv6t2 (const char *);
#endif
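Note: the entry points declared above can be called directly; whether a given symbol is actually built depends on the same preprocessor conditions and on the string/Dir.mk configuration. A minimal caller, shown only as an illustration, might look like this:

#include <stdio.h>
#include <string.h>
#include "stringlib.h"

int main (void)
{
  char buf[32];
#if __aarch64__
  __strcpy_aarch64 (buf, "hello");
#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
  __strcpy_arm (buf, "hello");
#else
  strcpy (buf, "hello");      /* Fall back to the C library. */
#endif
  printf ("%s\n", buf);
  return 0;
}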
diff --git a/string/memchr.S b/string/memchr.S
new file mode 100644
index 0000000..0a564d8
--- /dev/null
+++ b/string/memchr.S
@@ -0,0 +1,15 @@
+/*
+ * Selected possible memchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memchr-sve.S"
+# endif
+#elif __arm__
+#include "arm/memchr.S"
+#endif
diff --git a/string/memcmp.S b/string/memcmp.S
new file mode 100644
index 0000000..22da685
--- /dev/null
+++ b/string/memcmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible memcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memcmp-sve.S"
+# endif
+#endif
diff --git a/string/memcpy.S b/string/memcpy.S
new file mode 100644
index 0000000..c0f23e3
--- /dev/null
+++ b/string/memcpy.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcpy.S"
+#elif __arm__
+#include "arm/memcpy.S"
+#endif
diff --git a/string/memmove.S b/string/memmove.S
new file mode 100644
index 0000000..be3c7a1
--- /dev/null
+++ b/string/memmove.S
@@ -0,0 +1,10 @@
+/*
+ * Selected possible memmove implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memmove.S"
+#endif
diff --git a/string/memset.S b/string/memset.S
new file mode 100644
index 0000000..57542ef
--- /dev/null
+++ b/string/memset.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memset implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memset.S"
+#elif __arm__
+#include "arm/memset.S"
+#endif
diff --git a/string/strchr.S b/string/strchr.S
new file mode 100644
index 0000000..8cead02
--- /dev/null
+++ b/string/strchr.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchr-sve.S"
+# endif
+#endif
diff --git a/string/strchrnul.S b/string/strchrnul.S
new file mode 100644
index 0000000..3dfdeef
--- /dev/null
+++ b/string/strchrnul.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchrnul implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchrnul.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchrnul-sve.S"
+# endif
+#endif
diff --git a/string/strcmp.S b/string/strcmp.S
new file mode 100644
index 0000000..12530ec
--- /dev/null
+++ b/string/strcmp.S
@@ -0,0 +1,19 @@
+/*
+ * Selected possible strcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcmp-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+#include "arm/strcmp.S"
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+#include "arm/strcmp-armv6m.S"
+# endif
+#endif
diff --git a/string/strcpy-c.c b/string/strcpy-c.c
new file mode 100644
index 0000000..6bde24a
--- /dev/null
+++ b/string/strcpy-c.c
@@ -0,0 +1,10 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __arm__ && defined (__thumb2__) && !defined (__thumb__)
+#include "arm/strcpy.c"
+#endif
diff --git a/string/strcpy.S b/string/strcpy.S
new file mode 100644
index 0000000..a604b22
--- /dev/null
+++ b/string/strcpy.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcpy.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcpy-sve.S"
+# endif
+#endif
diff --git a/string/strlen.S b/string/strlen.S
new file mode 100644
index 0000000..d681033
--- /dev/null
+++ b/string/strlen.S
@@ -0,0 +1,17 @@
+/*
+ * Selected possible strlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strlen-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+#include "arm/strlen-armv6t2.S"
+# endif
+#endif
diff --git a/string/strncmp.S b/string/strncmp.S
new file mode 100644
index 0000000..26b56b7
--- /dev/null
+++ b/string/strncmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strncmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strncmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strncmp-sve.S"
+# endif
+#endif
diff --git a/string/strnlen.S b/string/strnlen.S
new file mode 100644
index 0000000..eebe777
--- /dev/null
+++ b/string/strnlen.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strnlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strnlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strnlen-sve.S"
+# endif
+#endif
diff --git a/string/strrchr.S b/string/strrchr.S
new file mode 100644
index 0000000..18b1cf9
--- /dev/null
+++ b/string/strrchr.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible strrchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+# if __ARM_FEATURE_SVE
+#include "aarch64/strrchr-sve.S"
+# endif
+#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
new file mode 100644
index 0000000..8d609c9
--- /dev/null
+++ b/string/test/memchr.c
@@ -0,0 +1,94 @@
+/*
+ * memchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(const void *, int c, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memchr)
+#if __aarch64__
+F(__memchr_aarch64)
+# if __ARM_FEATURE_SVE
+F(__memchr_aarch64_sve)
+# endif
+#elif __arm__
+F(__memchr_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static unsigned char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + align;
+ unsigned char *f = len ? s + seekpos : 0;
+ int seekchar = 0x1;
+ int i;
+ void *p;
+
+ if (len > LEN || seekpos >= len || align >= A)
+ abort();
+
+ for (i = 0; i < seekpos; i++)
+ s[i] = 'a' + i%23;
+ s[i++] = seekchar;
+ for (; i < len; i++)
+ s[i] = 'a' + i%23;
+
+ p = fun->fun(s, seekchar, len);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ for (int n = 0; n < 100; n++)
+ for (int sp = 0; sp < n-1; sp++)
+ test(funtab+i, a, sp, n);
+ for (int n = 100; n < LEN; n *= 2) {
+ test(funtab+i, a, n-1, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
new file mode 100644
index 0000000..63b07bd
--- /dev/null
+++ b/string/test/memcmp.c
@@ -0,0 +1,97 @@
+/*
+ * memcmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const void *s1, const void *s2, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memcmp)
+#if __aarch64__
+F(__memcmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__memcmp_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char s1buf[LEN+2*A];
+static unsigned char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+{
+ unsigned char *src1 = alignup(s1buf);
+ unsigned char *src2 = alignup(s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos && diffpos >= len)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos)
+ s1[diffpos]++;
+
+ r = fun->fun(s1, s2, len);
+
+ if ((!diffpos && r != 0) || (diffpos && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 1dccac7..26ab0ec 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -20,6 +20,9 @@ static const struct fun
F(memcpy)
#if __aarch64__
F(__memcpy_bytewise)
+F(__memcpy_aarch64)
+#elif __arm__
+F(__memcpy_arm)
#endif
#undef F
{0, 0}
diff --git a/string/test/memmove.c b/string/test/memmove.c
new file mode 100644
index 0000000..8164383
--- /dev/null
+++ b/string/test/memmove.c
@@ -0,0 +1,142 @@
+/*
+ * memmove test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memmove)
+#if __aarch64__
+F(__memmove_aarch64)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char dbuf[LEN+2*A];
+static unsigned char sbuf[LEN+2*A];
+static unsigned char wbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
+ }
+}
+
+static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(sbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ /* Copy the potential overlap range. */
+ if (s < d) {
+ for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
+ want[salign+i] = src[salign+i];
+ } else {
+ for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
+ want[len + dalign + i] = src[len + dalign + i];
+ }
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ abort();
+ break;
+ }
+ }
+}
+
+int main()
+{
+ test_overlap(funtab+0, 2, 1, 1);
+
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
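Note: test_overlap above builds its expected buffer by hand from the overlap range. The semantics it checks are those of a direction-aware copy; a naive reference sketch (not the library implementation) makes this explicit:

#include <stddef.h>
#include <stdint.h>

void *memmove_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if ((uintptr_t) d < (uintptr_t) s)
    {
      /* Forward copy: dst is below src, so reads stay ahead of writes. */
      for (size_t i = 0; i < n; i++)
        d[i] = s[i];
    }
  else
    {
      /* Backward copy: dst is above src, so copy from the end. */
      for (size_t i = n; i > 0; i--)
        d[i - 1] = s[i - 1];
    }
  return dst;
}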
diff --git a/string/test/memset.c b/string/test/memset.c
new file mode 100644
index 0000000..c0c7ed6
--- /dev/null
+++ b/string/test/memset.c
@@ -0,0 +1,112 @@
+/*
+ * memset test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *s, int c, size_t n);
+} funtab[] = {
+#define F(x) {#x, x},
+F(memset)
+#if __aarch64__
+F(__memset_aarch64)
+#elif __arm__
+F(__memset_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static unsigned char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void err(const char *name, unsigned char *src, int salign, int c, int len)
+{
+ ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
+ ERR("got : %.*s\n", salign+len+1, src);
+}
+
+static void test(const struct fun *fun, int salign, int c, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (len > LEN || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i%23;
+ for (; i<len%A; i++)
+ s[i] = '*';
+
+ p = fun->fun(s, c, len);
+ if (p != s)
+ ERR("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++) {
+ if (src[i] != '?') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+ for (i = salign; i < len; i++) {
+ if (src[i] != (unsigned char)c) {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+ for (; i < len%A; i++) {
+ if (src[i] != '*') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
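Note: the test deliberately passes values such as 0xaa25 for c, since memset stores the value converted to unsigned char and only the low byte is written. A standalone illustration, separate from the test harness:

#include <assert.h>
#include <string.h>

int main (void)
{
  unsigned char buf[8];
  memset (buf, 0xaa25, sizeof buf);
  assert (buf[0] == 0x25);   /* (unsigned char)0xaa25 == 0x25 */
  return 0;
}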
diff --git a/string/test/strchr.c b/string/test/strchr.c
new file mode 100644
index 0000000..30c714f
--- /dev/null
+++ b/string/test/strchr.c
@@ -0,0 +1,98 @@
+/*
+ * strchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strchr)
+#if __aarch64__
+F(__strchr_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strchr_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
new file mode 100644
index 0000000..c4260e6
--- /dev/null
+++ b/string/test/strchrnul.c
@@ -0,0 +1,100 @@
+/*
+ * strchrnul test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strchrnul)
+#if __aarch64__
+F(__strchrnul_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strchrnul_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len - 1;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
new file mode 100644
index 0000000..c4e8867
--- /dev/null
+++ b/string/test/strcmp.c
@@ -0,0 +1,104 @@
+/*
+ * strcmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const char *s1, const char *s2);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strcmp)
+#if __aarch64__
+F(__strcmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strcmp_aarch64_sve)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+F(__strcmp_arm)
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+F(__strcmp_armv6m)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+{
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
+
+ r = fun->fun(s1, s2);
+
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
new file mode 100644
index 0000000..3072ade
--- /dev/null
+++ b/string/test/strcpy.c
@@ -0,0 +1,100 @@
+/*
+ * strcpy test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(char *dest, const char *src);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strcpy)
+#if __aarch64__
+F(__strcpy_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strcpy_aarch64_sve)
+# endif
+#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
+F(__strcpy_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char dbuf[LEN+2*A];
+static char sbuf[LEN+2*A];
+static char wbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup(sbuf);
+ char *dst = alignup(dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len-1; i++)
+ s[i] = w[i] = 'a' + i%23;
+ s[i] = w[i] = '\0';
+
+ p = fun->fun(d, s);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++)
+ test(funtab+i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, d, s, n);
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strlen.c b/string/test/strlen.c
new file mode 100644
index 0000000..700c865
--- /dev/null
+++ b/string/test/strlen.c
@@ -0,0 +1,91 @@
+/*
+ * strlen test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun)(const char *s);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strlen)
+#if __aarch64__
+F(__strlen_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strlen_aarch64_sve)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+F(__strlen_armv6t2)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
+
+ if (len > LEN || align >= A)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
+
+ r = fun->fun(s);
+ if (r != len-1) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ test(funtab+i, a, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, a, n);
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
new file mode 100644
index 0000000..14e0a8c
--- /dev/null
+++ b/string/test/strncmp.c
@@ -0,0 +1,104 @@
+/*
+ * strncmp test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ int (*fun)(const char *, const char *, size_t);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strncmp)
+#if __aarch64__
+F(__strncmp_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strncmp_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define LEN 250000
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
+{
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
+
+ r = fun->fun(s1, s2, maxlen);
+
+ diffpos = maxlen <= diffpos ? 0 : diffpos;
+
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
+ fun->name, s1align, s2align, maxlen, len, r, diffpos);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
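Note: the clamp of diffpos against maxlen above reflects strncmp semantics: at most maxlen characters are compared, so a difference at or beyond that limit must not change the result. A standalone illustration, separate from the test harness:

#include <assert.h>
#include <string.h>

int main (void)
{
  const char *a = "abcdef";
  const char *b = "abcxyz";          /* first difference at index 3 */
  assert (strncmp (a, b, 3) == 0);   /* difference lies beyond maxlen */
  assert (strncmp (a, b, 4) != 0);   /* difference now within range */
  return 0;
}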
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
new file mode 100644
index 0000000..9a98d80
--- /dev/null
+++ b/string/test/strnlen.c
@@ -0,0 +1,94 @@
+/*
+ * strnlen test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun)(const char *s, size_t m);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strnlen)
+#if __aarch64__
+F(__strnlen_aarch64)
+# if __ARM_FEATURE_SVE
+F(__strnlen_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int maxlen, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len - 1;
+
+ if (len > LEN || align >= A)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
+
+ r = fun->fun(s, maxlen);
+ if (r != e) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ for (int maxlen = 0; maxlen < 100; maxlen++)
+ test(funtab+i, a, maxlen, n);
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, n*2, n);
+ test(funtab+i, a, n, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
new file mode 100644
index 0000000..b3fc2a9
--- /dev/null
+++ b/string/test/strrchr.c
@@ -0,0 +1,97 @@
+/*
+ * strrchr test.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "stringlib.h"
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun)(const char *s, int c);
+} funtab[] = {
+#define F(x) {#x, x},
+F(strrchr)
+#if __aarch64__
+# if __ARM_FEATURE_SVE
+F(__strrchr_aarch64_sve)
+# endif
+#endif
+#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
+{
+ return (void*)(((uintptr_t)p + A-1) & -A);
+}
+
+static void test(const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos/2] = s[seekpos] = seekchar;
+ s[len - 1] = '\0';
+
+ p = fun->fun(s, seekchar);
+
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
+}
+
+int main()
+{
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ if (test_status) {
+ r = -1;
+ ERR("FAIL %s\n", funtab[i].name);
+ }
+ }
+ return r;
+}