author     android-build-team Robot <android-build-team-robot@google.com>  2021-06-19 12:01:09 +0000
committer  android-build-team Robot <android-build-team-robot@google.com>  2021-06-19 12:01:09 +0000
commit     fa8a48fe8ce27c09976dede4af037c7acdb59a87 (patch)
tree       7ded61f3cfe322fe4706181e50b2f28b35d8a4a0
parent     8edcec53c6d84dc7f85e4c0a8539384b3fe489ec (diff)
parent     0cbe0156ef389ae56254a55c909c3da03b72616c (diff)
download   arm-optimized-routines-android12-mainline-media-swcodec-release.tar.gz
Change-Id: Idd5cf84cd34d79d8c782bf86002b6b91b851ea16
Diffstat:
-rwxr-xr-x  Android.bp  88
-rw-r--r--  METADATA  6
-rw-r--r--  Makefile  4
l---------  NOTICE  1
-rw-r--r--  README  6
-rw-r--r--  TEST_MAPPING  3
-rw-r--r--  arm-optimized-routines-tests.xml  26
-rw-r--r--  config.mk.dist  32
-rw-r--r--  math/cosf.c  2
-rw-r--r--  math/erf.c  244
-rw-r--r--  math/erf_data.c  85
-rw-r--r--  math/erff.c  104
-rw-r--r--  math/erff_data.c  22
-rw-r--r--  math/exp.c  2
-rw-r--r--  math/exp2.c  2
-rw-r--r--  math/expf.c  2
-rw-r--r--  math/include/mathlib.h  2
-rw-r--r--  math/log.c  2
-rw-r--r--  math/log2.c  2
-rw-r--r--  math/logf.c  2
-rw-r--r--  math/logf_data.c  2
-rw-r--r--  math/math_config.h  56
-rw-r--r--  math/math_errf.c  16
-rw-r--r--  math/pow.c  2
-rw-r--r--  math/powf.c  2
-rw-r--r--  math/powf_log2_data.c  2
-rw-r--r--  math/sincosf.c  2
-rw-r--r--  math/sincosf_data.c  2
-rw-r--r--  math/sinf.c  2
-rw-r--r--  math/test/mathbench.c  4
-rw-r--r--  math/test/mathtest.c  2
-rw-r--r--  math/test/rtest/dotest.c  2
-rw-r--r--  math/test/rtest/intern.h  2
-rw-r--r--  math/test/rtest/main.c  2
-rw-r--r--  math/test/rtest/random.c  2
-rw-r--r--  math/test/rtest/random.h  2
-rw-r--r--  math/test/rtest/semi.c  2
-rw-r--r--  math/test/rtest/semi.h  2
-rw-r--r--  math/test/rtest/types.h  2
-rw-r--r--  math/test/rtest/wrappers.c  2
-rw-r--r--  math/test/rtest/wrappers.h  2
-rwxr-xr-x  math/test/runulp.sh  23
-rw-r--r--  math/test/testcases/directed/cosf.tst  2
-rw-r--r--  math/test/testcases/directed/erf.tst  17
-rw-r--r--  math/test/testcases/directed/erff.tst  17
-rw-r--r--  math/test/testcases/directed/exp.tst  2
-rw-r--r--  math/test/testcases/directed/exp2.tst  2
-rw-r--r--  math/test/testcases/directed/exp2f.tst  2
-rw-r--r--  math/test/testcases/directed/expf.tst  2
-rw-r--r--  math/test/testcases/directed/log.tst  2
-rw-r--r--  math/test/testcases/directed/log2.tst  2
-rw-r--r--  math/test/testcases/directed/log2f.tst  2
-rw-r--r--  math/test/testcases/directed/logf.tst  2
-rw-r--r--  math/test/testcases/directed/pow.tst  2
-rw-r--r--  math/test/testcases/directed/powf.tst  2
-rw-r--r--  math/test/testcases/directed/sincosf.tst  2
-rw-r--r--  math/test/testcases/directed/sinf.tst  2
-rw-r--r--  math/test/testcases/random/double.tst  2
-rw-r--r--  math/test/testcases/random/float.tst  2
-rw-r--r--  math/test/ulp.c  4
-rwxr-xr-x  math/tools/remez.jl  2
-rw-r--r--  math/v_math.h  2
-rw-r--r--  networking/Dir.mk  76
-rw-r--r--  networking/aarch64/chksum_simd.c  146
-rw-r--r--  networking/arm/chksum_simd.c  149
-rw-r--r--  networking/chksum.c  81
-rw-r--r--  networking/chksum_common.h  132
-rw-r--r--  networking/include/networking.h  14
-rw-r--r--  networking/test/chksum.c  381
-rwxr-xr-x  run-arm-optimized-routines-tests-on-android.sh  12
-rw-r--r--  string/Dir.mk  62
-rw-r--r--  string/aarch64/__mtag_tag_region.S  100
-rw-r--r--  string/aarch64/__mtag_tag_zero_region.S  100
-rw-r--r--  string/aarch64/check-arch.S  13
-rw-r--r--  string/aarch64/memchr-mte.S  116
-rw-r--r--  string/aarch64/memchr-sve.S  22
-rw-r--r--  string/aarch64/memchr.S  7
-rw-r--r--  string/aarch64/memcmp-sve.S  21
-rw-r--r--  string/aarch64/memcmp.S  6
-rw-r--r--  string/aarch64/memcpy-advsimd.S  206
-rw-r--r--  string/aarch64/memcpy.S  136
-rw-r--r--  string/aarch64/memcpy_simd.S  265
-rw-r--r--  string/aarch64/memrchr.S  117
-rw-r--r--  string/aarch64/memset.S  128
-rw-r--r--  string/aarch64/stpcpy-mte.S  10
-rw-r--r--  string/aarch64/stpcpy-sve.S  10
-rw-r--r--  string/aarch64/stpcpy.S  10
-rw-r--r--  string/aarch64/strchr-mte.S  105
-rw-r--r--  string/aarch64/strchr-sve.S  19
-rw-r--r--  string/aarch64/strchr.S  47
-rw-r--r--  string/aarch64/strchrnul-mte.S  84
-rw-r--r--  string/aarch64/strchrnul-sve.S  2
-rw-r--r--  string/aarch64/strchrnul.S  30
-rw-r--r--  string/aarch64/strcmp-mte.S  189
-rw-r--r--  string/aarch64/strcmp-sve.S  22
-rw-r--r--  string/aarch64/strcmp.S  5
-rw-r--r--  string/aarch64/strcpy-mte.S  161
-rw-r--r--  string/aarch64/strcpy-sve.S  20
-rw-r--r--  string/aarch64/strcpy.S  5
-rw-r--r--  string/aarch64/strlen-mte.S  80
-rw-r--r--  string/aarch64/strlen-sve.S  22
-rw-r--r--  string/aarch64/strlen.S  274
-rw-r--r--  string/aarch64/strncmp-mte.S  307
-rw-r--r--  string/aarch64/strncmp-sve.S  21
-rw-r--r--  string/aarch64/strncmp.S  19
-rw-r--r--  string/aarch64/strnlen-sve.S  22
-rw-r--r--  string/aarch64/strnlen.S  221
-rw-r--r--  string/aarch64/strrchr-mte.S  127
-rw-r--r--  string/aarch64/strrchr-sve.S  19
-rw-r--r--  string/aarch64/strrchr.S  22
-rw-r--r--  string/arm/check-arch.S  10
-rw-r--r--  string/arm/memchr.S  3
-rw-r--r--  string/arm/memcpy.S  24
-rw-r--r--  string/arm/memset.S  3
-rw-r--r--  string/arm/strcmp-armv6m.S  6
-rw-r--r--  string/arm/strcmp.S  7
-rw-r--r--  string/arm/strcpy.c  6
-rw-r--r--  string/arm/strlen-armv6t2.S  6
-rw-r--r--  string/asmdefs.h  69
-rw-r--r--  string/bench/memcpy.c  260
-rw-r--r--  string/bench/strlen.c  221
-rw-r--r--  string/include/benchlib.h  33
-rw-r--r--  string/include/stringlib.h  18
-rw-r--r--  string/memchr.S  15
-rw-r--r--  string/memcmp.S  13
-rw-r--r--  string/memcpy.S  15
-rw-r--r--  string/memset.S  12
-rw-r--r--  string/strchr.S  13
-rw-r--r--  string/strchrnul.S  13
-rw-r--r--  string/strcmp.S  19
-rw-r--r--  string/strcpy-c.c  10
-rw-r--r--  string/strcpy.S  13
-rw-r--r--  string/strlen.S  17
-rw-r--r--  string/strncmp.S  13
-rw-r--r--  string/strnlen.S  13
-rw-r--r--  string/strrchr.S  13
-rw-r--r--  string/test/__mtag_tag_region.c  147
-rw-r--r--  string/test/__mtag_tag_zero_region.c  147
-rw-r--r--  string/test/memchr.c  131
-rw-r--r--  string/test/memcmp.c  149
-rw-r--r--  string/test/memcpy.c  146
-rw-r--r--  string/test/memmove.c  234
-rw-r--r--  string/test/memrchr.c  106
-rw-r--r--  string/test/memset.c  162
-rw-r--r--  string/test/mte.h  142
-rw-r--r--  string/test/stpcpy.c  125
-rw-r--r--  string/test/strchr.c  148
-rw-r--r--  string/test/strchrnul.c  151
-rw-r--r--  string/test/strcmp.c  155
-rw-r--r--  string/test/strcpy.c  154
-rw-r--r--  string/test/stringtest.h  55
-rw-r--r--  string/test/strlen.c  119
-rw-r--r--  string/test/strncmp.c  170
-rw-r--r--  string/test/strnlen.c  132
-rw-r--r--  string/test/strrchr.c  148
-rw-r--r--  string/x86_64/check-arch.S  10
156 files changed, 6530 insertions, 2033 deletions
diff --git a/Android.bp b/Android.bp
index ea477a1..ba814eb 100755
--- a/Android.bp
+++ b/Android.bp
@@ -1,3 +1,20 @@
+package {
+ default_applicable_licenses: ["external_arm-optimized-routines_license"],
+}
+
+// Added automatically by a large-scale-change
+// See: http://go/android-license-faq
+license {
+ name: "external_arm-optimized-routines_license",
+ visibility: [":__subpackages__"],
+ license_kinds: [
+ "SPDX-license-identifier-MIT",
+ ],
+ license_text: [
+ "LICENSE",
+ ],
+}
+
cc_defaults {
name: "arm-optimized-routines-defaults",
host_supported: true,
@@ -26,12 +43,37 @@ cc_defaults {
local_include_dirs: ["math/include"],
}
-cc_library {
- name: "libarm-optimized-routines-math",
+cc_defaults {
+ name: "libarm-optimized-routines-defaults",
defaults: ["arm-optimized-routines-defaults"],
ramdisk_available: true,
+ vendor_ramdisk_available: true,
recovery_available: true,
native_bridge_supported: true,
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.runtime",
+ ],
+
+ stl: "none",
+ static: {
+ system_shared_libs: [],
+ },
+ header_libs: ["libc_headers"],
+}
+
+cc_library_static {
+ name: "libarm-optimized-routines-math",
+ defaults: ["libarm-optimized-routines-defaults"],
+ exclude_srcs: [
+ // Provided by:
+ // bionic/libm/upstream-freebsd/lib/msun/src/s_erf.c
+ // bionic/libm/upstream-freebsd/lib/msun/src/s_erff.c
+ "math/erf.c",
+ "math/erf_data.c",
+ "math/erff.c",
+ "math/erff_data.c",
+ ],
srcs: [
"math/*.c",
],
@@ -53,9 +95,43 @@ cc_library {
enabled: true,
},
},
- stl: "none",
- static: {
- system_shared_libs: [],
+}
+
+cc_library_static {
+ name: "libarm-optimized-routines-string",
+ defaults: ["libarm-optimized-routines-defaults"],
+
+ arch: {
+ arm64: {
+ srcs: [
+ "string/aarch64/memchr-mte.S",
+ "string/aarch64/memchr.S",
+ "string/aarch64/memcmp.S",
+ "string/aarch64/memrchr.S",
+ "string/aarch64/stpcpy-mte.S",
+ "string/aarch64/stpcpy.S",
+ "string/aarch64/strchrnul-mte.S",
+ "string/aarch64/strchrnul.S",
+ "string/aarch64/strchr-mte.S",
+ "string/aarch64/strchr.S",
+ "string/aarch64/strcmp-mte.S",
+ "string/aarch64/strcmp.S",
+ "string/aarch64/strcpy-mte.S",
+ "string/aarch64/strcpy.S",
+ "string/aarch64/strlen-mte.S",
+ "string/aarch64/strlen.S",
+ "string/aarch64/strncmp-mte.S",
+ "string/aarch64/strncmp.S",
+ "string/aarch64/strnlen.S",
+ "string/aarch64/strrchr-mte.S",
+ "string/aarch64/strrchr.S",
+ ],
+ asflags: [
+ "-D__memcmp_aarch64=memcmp",
+ "-D__memrchr_aarch64=memrchr",
+ "-D__strnlen_aarch64=strnlen",
+ ]
+ },
},
}
@@ -93,7 +169,7 @@ sh_test {
test_suites: ["general-tests"],
host_supported: true,
device_supported: false,
- test_config: "arm-optimized-routines-tests.xml",
+ require_root: true,
target_required: [
"mathtest",
"ulp",
diff --git a/METADATA b/METADATA
index 94791ae..59af591 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "33ba19089a261964e1e84ba4edf90263b468c161"
+ version: "v21.02"
license_type: NOTICE
last_upgrade_date {
- year: 2020
+ year: 2021
month: 2
- day: 1
+ day: 18
}
}
diff --git a/Makefile b/Makefile
index dee6134..169f89e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
# Makefile - requires GNU make
#
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
# SPDX-License-Identifier: MIT
srcdir = .
@@ -10,7 +10,7 @@ libdir = $(prefix)/lib
includedir = $(prefix)/include
# Configure these in config.mk, do not make changes in this file.
-SUBS = math string
+SUBS = math string networking
HOST_CC = cc
HOST_CFLAGS = -std=c99 -O2
HOST_LDFLAGS =
diff --git a/NOTICE b/NOTICE
deleted file mode 120000
index 7a694c9..0000000
--- a/NOTICE
+++ /dev/null
@@ -1 +0,0 @@
-LICENSE \ No newline at end of file
diff --git a/README b/README
index 440f08a..ae465e9 100644
--- a/README
+++ b/README
@@ -8,7 +8,8 @@ Assignment Agreement, please follow the instructions in
contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.
-Regular quarterly releases are tagged as vYY.MM (e.g. v19.11).
+Regular quarterly releases are tagged as vYY.MM, the latest
+release is v20.11.
Source code layout:
@@ -17,6 +18,9 @@ math/ - math subproject sources.
math/include/ - math library public headers.
math/test/ - math test and benchmark related sources.
math/tools/ - tools used for designing the algorithms.
+networking/ - networking subproject sources.
+networking/include/ - networking library public headers.
+networking/test/ - networking test and benchmark related sources.
string/ - string routines subproject sources.
string/include/ - string library public headers.
string/test/ - string test and benchmark related sources.
diff --git a/TEST_MAPPING b/TEST_MAPPING
index e4d3d5e..66bdc01 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -2,6 +2,9 @@
"presubmit": [
{
"name": "CtsBionicTestCases"
+ },
+ {
+ "name": "arm-optimized-routines-tests"
}
]
}
diff --git a/arm-optimized-routines-tests.xml b/arm-optimized-routines-tests.xml
deleted file mode 100644
index 96db90c..0000000
--- a/arm-optimized-routines-tests.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!-- Copyright (C) 2019 The Android Open Source Project
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<configuration description="Config for running arm-optimized-routines-tests through Atest or in Infra">
- <option name="test-suite-tag" value="arm-optimized-routines-tests" />
- <!-- This test requires a device, so it's not annotated with a null-device. -->
- <test class="com.android.tradefed.testtype.binary.ExecutableHostTest" >
- <option name="binary" value="run-arm-optimized-routines-tests-on-android.sh" />
- <!-- Test script assumes a relative path with the tests/ folders. -->
- <option name="relative-path-execution" value="true" />
- <!-- Tests shouldn't be that long but set 15m to be safe. -->
- <option name="per-binary-timeout" value="15m" />
- </test>
-</configuration>
diff --git a/config.mk.dist b/config.mk.dist
index 301b5f9..177e1ac 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,20 +1,28 @@
# Example config.mk
#
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
# SPDX-License-Identifier: MIT
# Subprojects to build
-SUBS = math string
+SUBS = math string networking
-HOST_CC = gcc
-HOST_CFLAGS = -std=c99 -O2
-HOST_CFLAGS += -Wall -Wno-unused-function
+# Target architecture: aarch64, arm or x86_64
+ARCH = aarch64
+
+# Use for cross compilation with gcc.
+#CROSS_COMPILE = aarch64-none-linux-gnu-
+# Compiler for the target
CC = $(CROSS_COMPILE)gcc
CFLAGS = -std=c99 -pipe -O3
CFLAGS += -Wall -Wno-missing-braces
CFLAGS += -Werror=implicit-function-declaration
+# Used for test case generator that is executed on the host
+HOST_CC = gcc
+HOST_CFLAGS = -std=c99 -O2
+HOST_CFLAGS += -Wall -Wno-unused-function
+
# Enable debug info.
HOST_CFLAGS += -g
CFLAGS += -g
@@ -22,8 +30,8 @@ CFLAGS += -g
# Optimize the shared libraries on aarch64 assuming they fit in 1M.
#CFLAGS_SHARED = -fPIC -mcmodel=tiny
-# Use for cross compilation with gcc.
-#CROSS_COMPILE = aarch64-none-linux-gnu-
+# Enable MTE support.
+#CFLAGS += -march=armv8.5-a+memtag -DWANT_MTE_TEST=1
# Use with cross testing.
#EMULATOR = qemu-aarch64-static
@@ -35,6 +43,7 @@ math-ldlibs =
math-ulpflags =
math-testflags =
string-cflags =
+networking-cflags =
# Use if mpfr is available on the target for ulp error checking.
#math-ldlibs += -lmpfr -lgmp
@@ -53,3 +62,12 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Disable fenv checks
#math-ulpflags = -q -f
#math-testflags = -nostatus
+
+# Remove GNU Property Notes from asm files.
+#string-cflags += -DWANT_GNU_PROPERTY=0
+
+# Enable assertion checks.
+#networking-cflags += -DWANT_ASSERT
+
+# Avoid auto-vectorization of scalar code and unroll loops
+networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
diff --git a/math/cosf.c b/math/cosf.c
index 831b39e..f29f194 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision cos function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/erf.c b/math/erf.c
new file mode 100644
index 0000000..12d7e51
--- /dev/null
+++ b/math/erf.c
@@ -0,0 +1,244 @@
+/*
+ * Double-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+#include <math.h>
+#include <stdint.h>
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
+#define C 0x1.b0ac16p-1
+#define PA __erf_data.erf_poly_A
+#define NA __erf_data.erf_ratio_N_A
+#define DA __erf_data.erf_ratio_D_A
+#define NB __erf_data.erf_ratio_N_B
+#define DB __erf_data.erf_ratio_D_B
+#define PC __erf_data.erfc_poly_C
+#define PD __erf_data.erfc_poly_D
+#define PE __erf_data.erfc_poly_E
+#define PF __erf_data.erfc_poly_F
+
+/* Top 32 bits of a double. */
+static inline uint32_t
+top32 (double x)
+{
+ return asuint64 (x) >> 32;
+}
+
+/* Fast erf implementation using a mix of
+ rational and polynomial approximations.
+ Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */
+double
+erf (double x)
+{
+ /* Get top word and sign. */
+ uint32_t ix = top32 (x);
+ uint32_t ia = ix & 0x7fffffff;
+ uint32_t sign = ix >> 31;
+
+ /* Normalized and subnormal cases */
+ if (ia < 0x3feb0000)
+ { /* a = |x| < 0.84375. */
+
+ if (ia < 0x3e300000)
+ { /* a < 2^(-28). */
+ if (ia < 0x00800000)
+ { /* a < 2^(-1015). */
+ double y = fma (TwoOverSqrtPiMinusOne, x, x);
+ return check_uflow (y);
+ }
+ return x + TwoOverSqrtPiMinusOne * x;
+ }
+
+ double x2 = x * x;
+
+ if (ia < 0x3fe00000)
+ { /* a < 0.5 - Use polynomial approximation. */
+ double r1 = fma (x2, PA[1], PA[0]);
+ double r2 = fma (x2, PA[3], PA[2]);
+ double r3 = fma (x2, PA[5], PA[4]);
+ double r4 = fma (x2, PA[7], PA[6]);
+ double r5 = fma (x2, PA[9], PA[8]);
+ double x4 = x2 * x2;
+ double r = r5;
+ r = fma (x4, r, r4);
+ r = fma (x4, r, r3);
+ r = fma (x4, r, r2);
+ r = fma (x4, r, r1);
+ return fma (r, x, x); /* This fma is crucial for accuracy. */
+ }
+ else
+ { /* 0.5 <= a < 0.84375 - Use rational approximation. */
+ double x4, x8, r1n, r2n, r1d, r2d, r3d;
+
+ r1n = fma (x2, NA[1], NA[0]);
+ x4 = x2 * x2;
+ r2n = fma (x2, NA[3], NA[2]);
+ x8 = x4 * x4;
+ r1d = fma (x2, DA[0], 1.0);
+ r2d = fma (x2, DA[2], DA[1]);
+ r3d = fma (x2, DA[4], DA[3]);
+ double P = r1n + x4 * r2n + x8 * NA[4];
+ double Q = r1d + x4 * r2d + x8 * r3d;
+ return fma (P / Q, x, x);
+ }
+ }
+ else if (ia < 0x3ff40000)
+ { /* 0.84375 <= |x| < 1.25. */
+ double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
+ double a = fabs (x) - 1.0;
+ r1n = fma (a, NB[1], NB[0]);
+ a2 = a * a;
+ r1d = fma (a, DB[0], 1.0);
+ a4 = a2 * a2;
+ r2n = fma (a, NB[3], NB[2]);
+ a6 = a4 * a2;
+ r2d = fma (a, DB[2], DB[1]);
+ r3n = fma (a, NB[5], NB[4]);
+ r3d = fma (a, DB[4], DB[3]);
+ r4n = NB[6];
+ r4d = DB[5];
+ double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
+ double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
+ if (sign)
+ return -C - P / Q;
+ else
+ return C + P / Q;
+ }
+ else if (ia < 0x40000000)
+ { /* 1.25 <= |x| < 2.0. */
+ double a = fabs (x);
+ a = a - 1.25;
+
+ double r1 = fma (a, PC[1], PC[0]);
+ double r2 = fma (a, PC[3], PC[2]);
+ double r3 = fma (a, PC[5], PC[4]);
+ double r4 = fma (a, PC[7], PC[6]);
+ double r5 = fma (a, PC[9], PC[8]);
+ double r6 = fma (a, PC[11], PC[10]);
+ double r7 = fma (a, PC[13], PC[12]);
+ double r8 = fma (a, PC[15], PC[14]);
+
+ double a2 = a * a;
+
+ double r = r8;
+ r = fma (a2, r, r7);
+ r = fma (a2, r, r6);
+ r = fma (a2, r, r5);
+ r = fma (a2, r, r4);
+ r = fma (a2, r, r3);
+ r = fma (a2, r, r2);
+ r = fma (a2, r, r1);
+
+ if (sign)
+ return -1.0 + r;
+ else
+ return 1.0 - r;
+ }
+ else if (ia < 0x400a0000)
+ { /* 2 <= |x| < 3.25. */
+ double a = fabs (x);
+ a = fma (0.5, a, -1.0);
+
+ double r1 = fma (a, PD[1], PD[0]);
+ double r2 = fma (a, PD[3], PD[2]);
+ double r3 = fma (a, PD[5], PD[4]);
+ double r4 = fma (a, PD[7], PD[6]);
+ double r5 = fma (a, PD[9], PD[8]);
+ double r6 = fma (a, PD[11], PD[10]);
+ double r7 = fma (a, PD[13], PD[12]);
+ double r8 = fma (a, PD[15], PD[14]);
+ double r9 = fma (a, PD[17], PD[16]);
+
+ double a2 = a * a;
+
+ double r = r9;
+ r = fma (a2, r, r8);
+ r = fma (a2, r, r7);
+ r = fma (a2, r, r6);
+ r = fma (a2, r, r5);
+ r = fma (a2, r, r4);
+ r = fma (a2, r, r3);
+ r = fma (a2, r, r2);
+ r = fma (a2, r, r1);
+
+ if (sign)
+ return -1.0 + r;
+ else
+ return 1.0 - r;
+ }
+ else if (ia < 0x40100000)
+ { /* 3.25 <= |x| < 4.0. */
+ double a = fabs (x);
+ a = a - 3.25;
+
+ double r1 = fma (a, PE[1], PE[0]);
+ double r2 = fma (a, PE[3], PE[2]);
+ double r3 = fma (a, PE[5], PE[4]);
+ double r4 = fma (a, PE[7], PE[6]);
+ double r5 = fma (a, PE[9], PE[8]);
+ double r6 = fma (a, PE[11], PE[10]);
+ double r7 = fma (a, PE[13], PE[12]);
+
+ double a2 = a * a;
+
+ double r = r7;
+ r = fma (a2, r, r6);
+ r = fma (a2, r, r5);
+ r = fma (a2, r, r4);
+ r = fma (a2, r, r3);
+ r = fma (a2, r, r2);
+ r = fma (a2, r, r1);
+
+ if (sign)
+ return -1.0 + r;
+ else
+ return 1.0 - r;
+ }
+ else if (ia < 0x4017a000)
+ { /* 4 <= |x| < 5.90625. */
+ double a = fabs (x);
+ a = fma (0.5, a, -2.0);
+
+ double r1 = fma (a, PF[1], PF[0]);
+ double r2 = fma (a, PF[3], PF[2]);
+ double r3 = fma (a, PF[5], PF[4]);
+ double r4 = fma (a, PF[7], PF[6]);
+ double r5 = fma (a, PF[9], PF[8]);
+ double r6 = fma (a, PF[11], PF[10]);
+ double r7 = fma (a, PF[13], PF[12]);
+ double r8 = fma (a, PF[15], PF[14]);
+ double r9 = PF[16];
+
+ double a2 = a * a;
+
+ double r = r9;
+ r = fma (a2, r, r8);
+ r = fma (a2, r, r7);
+ r = fma (a2, r, r6);
+ r = fma (a2, r, r5);
+ r = fma (a2, r, r4);
+ r = fma (a2, r, r3);
+ r = fma (a2, r, r2);
+ r = fma (a2, r, r1);
+
+ if (sign)
+ return -1.0 + r;
+ else
+ return 1.0 - r;
+ }
+ else
+ {
+ /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
+ if (unlikely (ia >= 0x7ff00000))
+ return (double) (1.0 - (sign << 1)) + 1.0 / x;
+
+ if (sign)
+ return -1.0;
+ else
+ return 1.0;
+ }
+}
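
Note (not part of the patch): the |x| < 0.5 branch above evaluates x + x*P(x^2), pairing coefficients with fma and finishing with fma (r, x, x) so the last rounding happens in a single operation. A minimal standalone sketch of that scheme, reusing the erf_poly_A coefficients from math/erf_data.c below; erf_small is a made-up name, is only valid for |x| < 0.5, and needs -lm to link:

#include <math.h>
#include <stdio.h>

/* Coefficients copied from erf_poly_A in math/erf_data.c.  */
static const double PA[10] = {
  0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
  -0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
  0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
  -0x1.18c47fd143c5ep-23
};

/* x + x*P(x^2) via fma-paired Horner evaluation, valid for |x| < 0.5.  */
static double
erf_small (double x)
{
  double x2 = x * x;
  double r1 = fma (x2, PA[1], PA[0]);
  double r2 = fma (x2, PA[3], PA[2]);
  double r3 = fma (x2, PA[5], PA[4]);
  double r4 = fma (x2, PA[7], PA[6]);
  double r5 = fma (x2, PA[9], PA[8]);
  double x4 = x2 * x2;
  double r = r5;
  r = fma (x4, r, r4);
  r = fma (x4, r, r3);
  r = fma (x4, r, r2);
  r = fma (x4, r, r1);
  return fma (r, x, x);   /* final fma keeps the result within ~1 ULP */
}

int
main (void)
{
  for (double x = 0.05; x < 0.5; x += 0.1)
    printf ("%.17g  %.17g\n", erf_small (x), erf (x));
  return 0;
}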
diff --git a/math/erf_data.c b/math/erf_data.c
new file mode 100644
index 0000000..807875b
--- /dev/null
+++ b/math/erf_data.c
@@ -0,0 +1,85 @@
+/*
+ * Shared data between erf and erfc.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/*
+Minimax approximation of erf
+*/
+const struct erf_data __erf_data = {
+.erf_poly_A = {
+#if ERF_POLY_A_NCOEFFS == 10
+0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
+-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
+0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
+-0x1.18c47fd143c5ep-23
+#endif
+},
+/* Rational approximation on [0x1p-28, 0.84375] */
+.erf_ratio_N_A = {
+0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
+-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
+},
+.erf_ratio_D_A = {
+0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
+0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
+},
+/* Rational approximation on [0.84375, 1.25] */
+.erf_ratio_N_B = {
+-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
+0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
+-0x1.1bf380a96073fp-9
+},
+.erf_ratio_D_B = {
+0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
+0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
+},
+.erfc_poly_C = {
+#if ERFC_POLY_C_NCOEFFS == 16
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
+0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
+-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
+-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
+-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
+0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
+-0x1.578c9e375d37p-19
+#endif
+},
+.erfc_poly_D = {
+#if ERFC_POLY_D_NCOEFFS == 18
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
+0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
+-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
+-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
+0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
+-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
+0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
+#endif
+},
+.erfc_poly_E = {
+#if ERFC_POLY_E_NCOEFFS == 14
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
+0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
+-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
+0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
+-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
+-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
+#endif
+},
+.erfc_poly_F = {
+#if ERFC_POLY_F_NCOEFFS == 17
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
+0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
+-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
+0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
+-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
+0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
+-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
+#endif
+}
+};
+
diff --git a/math/erff.c b/math/erff.c
new file mode 100644
index 0000000..a58e825
--- /dev/null
+++ b/math/erff.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "math_config.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define A __erff_data.erff_poly_A
+#define B __erff_data.erff_poly_B
+
+/* Top 12 bits of a float. */
+static inline uint32_t
+top12 (float x)
+{
+ return asuint (x) >> 20;
+}
+
+/* Efficient implementation of erff
+ using either a pure polynomial approximation or
+ the exponential of a polynomial.
+ Worst-case error is 1.09ulps at 0x1.c111acp-1. */
+float
+erff (float x)
+{
+ float r, x2, u;
+
+ /* Get top word. */
+ uint32_t ix = asuint (x);
+ uint32_t sign = ix >> 31;
+ uint32_t ia12 = top12 (x) & 0x7ff;
+
+ /* Limit of both intervals is 0.875 for performance reasons but coefficients
+ computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
+ from 0.94 to 1.1ulps. */
+ if (ia12 < 0x3f6)
+ { /* a = |x| < 0.875. */
+
+ /* Tiny and subnormal cases. */
+ if (unlikely (ia12 < 0x318))
+ { /* |x| < 2^(-28). */
+ if (unlikely (ia12 < 0x040))
+ { /* |x| < 2^(-119). */
+ float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
+ return check_uflowf (y);
+ }
+ return x + TwoOverSqrtPiMinusOne * x;
+ }
+
+ x2 = x * x;
+
+ /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
+ r = A[5];
+ r = fmaf (r, x2, A[4]);
+ r = fmaf (r, x2, A[3]);
+ r = fmaf (r, x2, A[2]);
+ r = fmaf (r, x2, A[1]);
+ r = fmaf (r, x2, A[0]);
+ r = fmaf (r, x, x);
+ }
+ else if (ia12 < 0x408)
+ { /* |x| < 4.0 - Use a custom Estrin scheme. */
+
+ float a = fabsf (x);
+ /* Start with Estrin scheme on high order (small magnitude) coefficients. */
+ r = fmaf (B[6], a, B[5]);
+ u = fmaf (B[4], a, B[3]);
+ x2 = x * x;
+ r = fmaf (r, x2, u);
+ /* Then switch to pure Horner scheme. */
+ r = fmaf (r, a, B[2]);
+ r = fmaf (r, a, B[1]);
+ r = fmaf (r, a, B[0]);
+ r = fmaf (r, a, a);
+ /* Single precision exponential with ~0.5ulps,
+ ensures erff has max. rel. error
+ < 1ulp on [0.921875, 4.0],
+ < 1.1ulps on [0.875, 4.0]. */
+ r = expf (-r);
+ /* Explicit copysign (calling copysignf increases latency). */
+ if (sign)
+ r = -1.0f + r;
+ else
+ r = 1.0f - r;
+ }
+ else
+ { /* |x| >= 4.0. */
+
+ /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
+ if (unlikely (ia12 >= 0x7f8))
+ return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+ /* Explicit copysign (calling copysignf increases latency). */
+ if (sign)
+ r = -1.0f;
+ else
+ r = 1.0f;
+ }
+ return r;
+}
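
Note (not part of the patch): erff above classifies |x| using only the top 12 bits of the float representation (sign, 8 exponent bits, 3 mantissa bits), so the interval limits 0.875 and 4.0 become the integer thresholds 0x3f6 and 0x408. A small standalone sketch of that classification; the probe values and main are illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static inline uint32_t
asuint (float f)
{
  uint32_t u;
  memcpy (&u, &f, sizeof u);
  return u;
}

/* Top 12 bits of a float, as in erff above.  */
static inline uint32_t
top12 (float x)
{
  return asuint (x) >> 20;
}

int
main (void)
{
  float probes[] = { 0.5f, 0.874f, 0.876f, 3.9f, 4.1f };
  for (int i = 0; i < 5; i++)
    {
      uint32_t ia12 = top12 (probes[i]) & 0x7ff;   /* drop the sign bit */
      const char *range;
      if (ia12 < 0x3f6)
	range = "|x| < 0.875";
      else if (ia12 < 0x408)
	range = "0.875 <= |x| < 4.0";
      else
	range = "|x| >= 4.0";
      printf ("x = %g -> ia12 = 0x%03x (%s)\n", probes[i], (unsigned) ia12, range);
    }
  return 0;
}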
diff --git a/math/erff_data.c b/math/erff_data.c
new file mode 100644
index 0000000..fa6b1ef
--- /dev/null
+++ b/math/erff_data.c
@@ -0,0 +1,22 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff. */
+const struct erff_data __erff_data = {
+.erff_poly_A = {
+0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
+},
+.erff_poly_B = {
+0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
+-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
+0x1.222900p-16f
+}
+};
+
diff --git a/math/exp.c b/math/exp.c
index 1909b8e..7f5024c 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -1,7 +1,7 @@
/*
* Double-precision e^x function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/exp2.c b/math/exp2.c
index 47aa479..35ab39f 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
/*
* Double-precision 2^x function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/expf.c b/math/expf.c
index 0fe1f7d..9b2f0c3 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,7 +1,7 @@
/*
* Single-precision e^x function.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 4493008..279d829 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2015-2019, Arm Limited.
+ * Copyright (c) 2015-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/log.c b/math/log.c
index b85d3ff..d3b7bc6 100644
--- a/math/log.c
+++ b/math/log.c
@@ -1,7 +1,7 @@
/*
* Double-precision log(x) function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/log2.c b/math/log2.c
index 804fb85..55102b7 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
/*
* Double-precision log2(x) function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/logf.c b/math/logf.c
index ee3120a..cfbaee1 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,7 +1,7 @@
/*
* Single-precision log function.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/logf_data.c b/math/logf_data.c
index 53c5f62..e8973ce 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for logf.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/math_config.h b/math/math_config.h
index 7a1cc81..e851043 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -12,12 +12,17 @@
#include <stdint.h>
#ifndef WANT_ROUNDING
-/* Correct special case results in non-nearest rounding modes. */
+/* If defined to 1, return correct results for special cases in non-nearest
+ rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
+ This may be set to 0 if there is no fenv support or if math functions only
+ get called in round to nearest mode. */
# define WANT_ROUNDING 1
#endif
#ifndef WANT_ERRNO
-/* Set errno according to ISO C with (math_errhandling & MATH_ERRNO) != 0. */
-# define WANT_ERRNO 1
+/* If defined to 1, set errno in math functions according to ISO C. Many math
+ libraries do not set errno, so this is 0 by default. It may need to be
+ set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */
+# define WANT_ERRNO 0
#endif
#ifndef WANT_ERRNO_UFLOW
/* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */
@@ -293,6 +298,24 @@ check_uflow (double x)
return WANT_ERRNO ? __math_check_uflow (x) : x;
}
+/* Check if the result overflowed to infinity. */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0. */
+HIDDEN float __math_check_uflowf (float);
+
+/* Check if the result overflowed to infinity. */
+static inline float
+check_oflowf (float x)
+{
+ return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Check if the result underflowed to 0. */
+static inline float
+check_uflowf (float x)
+{
+ return WANT_ERRNO ? __math_check_uflowf (x) : x;
+}
/* Shared between expf, exp2f and powf. */
#define EXP2F_TABLE_BITS 5
@@ -411,4 +434,29 @@ extern const struct pow_log_data
struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
} __pow_log_data HIDDEN;
+extern const struct erff_data
+{
+ float erff_poly_A[6];
+ float erff_poly_B[7];
+} __erff_data HIDDEN;
+
+#define ERF_POLY_A_ORDER 19
+#define ERF_POLY_A_NCOEFFS 10
+#define ERFC_POLY_C_NCOEFFS 16
+#define ERFC_POLY_D_NCOEFFS 18
+#define ERFC_POLY_E_NCOEFFS 14
+#define ERFC_POLY_F_NCOEFFS 17
+extern const struct erf_data
+{
+ double erf_poly_A[ERF_POLY_A_NCOEFFS];
+ double erf_ratio_N_A[5];
+ double erf_ratio_D_A[5];
+ double erf_ratio_N_B[7];
+ double erf_ratio_D_B[6];
+ double erfc_poly_C[ERFC_POLY_C_NCOEFFS];
+ double erfc_poly_D[ERFC_POLY_D_NCOEFFS];
+ double erfc_poly_E[ERFC_POLY_E_NCOEFFS];
+ double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
+} __erf_data HIDDEN;
+
#endif
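
Note (not part of the patch): the new check_uflowf/check_oflowf hooks only touch errno when WANT_ERRNO is 1, and this change switches the default to 0. A self-contained sketch of the intended usage pattern; __math_check_uflowf here is a local stand-in mirroring the helper added to math/math_errf.c, and tiny_scale is a hypothetical routine, not library code:

#include <errno.h>
#include <stdio.h>

#define WANT_ERRNO 1   /* set to 0 (the new default) to leave errno untouched */

/* Stand-in for the helper added to math/math_errf.c.  */
static float
__math_check_uflowf (float y)
{
  if (y == 0.0f)
    errno = ERANGE;
  return y;
}

static inline float
check_uflowf (float x)
{
  return WANT_ERRNO ? __math_check_uflowf (x) : x;
}

/* Hypothetical routine whose result may underflow to zero.  */
static float
tiny_scale (float x)
{
  float y = x * 0x1p-120f;
  return check_uflowf (y);
}

int
main (void)
{
  errno = 0;
  float r = tiny_scale (0x1p-30f);   /* product is 0x1p-150, which rounds to 0 */
  printf ("result=%a errno=%d\n", r, errno);
  return 0;
}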
diff --git a/math/math_errf.c b/math/math_errf.c
index 07154c5..d5350b8 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -1,7 +1,7 @@
/*
* Single-precision math error handling.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -64,3 +64,17 @@ __math_invalidf (float x)
float y = (x - x) / (x - x);
return isnan (x) ? y : with_errnof (y, EDOM);
}
+
+/* Check result and set errno if necessary. */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+ return y == 0.0f ? with_errnof (y, ERANGE) : y;
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+ return isinf (y) ? with_errnof (y, ERANGE) : y;
+}
diff --git a/math/pow.c b/math/pow.c
index ced7c4f..86842c6 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -1,7 +1,7 @@
/*
* Double-precision x^y function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/powf.c b/math/powf.c
index 1534a09..6ba45d3 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,7 +1,7 @@
/*
* Single-precision pow function.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index b9fbdc4..97e0d98 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for powf.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sincosf.c b/math/sincosf.c
index e6cd41e..9746f1c 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision sin/cos function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index 5d0b58e..ab4ac47 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for sinf, cosf and sincosf.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sinf.c b/math/sinf.c
index 770b294..ddbc1da 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,7 +1,7 @@
/*
* Single-precision sin function.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 33ceda3..0c17826 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -248,6 +248,7 @@ D (log2, 0.999, 1.001)
{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
D (xpow, 0.01, 11.1)
D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
F (dummyf, 1.0, 2.0)
F (expf, -9.9, 9.9)
@@ -275,6 +276,7 @@ F (cosf, -3.1, 3.1)
F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
#if WANT_VMATH
D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 2ff8c3f..3108967 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index f416477..6be79e1 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
/*
* dotest.c - actually generate mathlib test cases
*
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index af574b0..12a9c74 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -1,7 +1,7 @@
/*
* intern.h
*
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index e94e455..0d8ead8 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -1,7 +1,7 @@
/*
* main.c
*
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index e97a8c6..5612396 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -1,7 +1,7 @@
/*
* random.c - random number generator for producing mathlib test cases
*
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index c1ce956..b4b22df 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -1,7 +1,7 @@
/*
* random.h - header for random.c
*
- * Copyright (c) 2009-2018, Arm Limited.
+ * Copyright (c) 2009-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index 938dc3a..c9f0daf 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -1,7 +1,7 @@
/*
* semi.c: test implementations of mathlib seminumerical functions
*
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index da473a2..17dc415 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -1,7 +1,7 @@
/*
* semi.h: header for semi.c
*
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 1a76c2e..53cd557 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -1,7 +1,7 @@
/*
* types.h
*
- * Copyright (c) 2005-2018, Arm Limited.
+ * Copyright (c) 2005-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index acaf671..de45ac5 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -1,7 +1,7 @@
/*
* wrappers.c - wrappers to modify output of MPFR/MPC test functions
*
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 5804935..7b09c85 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -1,7 +1,7 @@
/*
* wrappers.h - wrappers to modify output of MPFR/MPC test functions
*
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index a8c391b..0190d9a 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
# ULP error check script.
#
-# Copyright (c) 2019, Arm Limited.
+# Copyright (c) 2019-2020, Arm Limited.
# SPDX-License-Identifier: MIT
#set -x
@@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
+L=1.0
+Ldir=0.9
+t erf 0 0xffff000000000000 10000
+t erf 0x1p-1022 0x1p-26 40000
+t erf -0x1p-1022 -0x1p-26 40000
+t erf 0x1p-26 0x1p3 40000
+t erf -0x1p-26 -0x1p3 40000
+t erf 0 inf 40000
+Ldir=0.5
+
L=0.01
t expf 0 0xffff0000 10000
t expf 0x1p-14 0x1p8 50000
@@ -119,6 +129,17 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+
+L=0.6
+Ldir=0.9
+t erff 0 0xffff0000 10000
+t erff 0x1p-127 0x1p-26 40000
+t erff -0x1p-127 -0x1p-26 40000
+t erff 0x1p-26 0x1p3 40000
+t erff -0x1p-26 -0x1p3 40000
+t erff 0 inf 40000
+Ldir=0.5
+
done
# vector functions
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 5dc0994..7916044 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,6 +1,6 @@
; cosf.tst - Directed test cases for SP cosine
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=cosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
new file mode 100644
index 0000000..7fa4d18
--- /dev/null
+++ b/math/test/testcases/directed/erf.tst
@@ -0,0 +1,17 @@
+; erf.tst - Directed test cases for erf
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
+func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
+func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
+func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
+func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
+func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
+func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000..d05b7b1
--- /dev/null
+++ b/math/test/testcases/directed/erff.tst
@@ -0,0 +1,17 @@
+; erff.tst
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index addfc0a..85d556c 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,6 +1,6 @@
; Directed test cases for exp
;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index 04a5a50..fa56c9f 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,6 +1,6 @@
; Directed test cases for exp2
;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 2b6a9b5..38cfc3f 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,6 +1,6 @@
; exp2f.tst - Directed test cases for exp2f
;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index 74664c7..ff0f671 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,6 +1,6 @@
; expf.tst - Directed test cases for expf
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=expf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index eeb762c..a0aa398 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,6 +1,6 @@
; Directed test cases for log
;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index e0765d8..ff1286c 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,6 +1,6 @@
; Directed test cases for log2
;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 8d685ba..5832c4f 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,6 +1,6 @@
; log2f.tst - Directed test cases for log2f
;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=log2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 7ccc873..6e68a36 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,6 +1,6 @@
; logf.tst - Directed test cases for logf
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=logf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index a4c42be..1966581 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,6 +1,6 @@
; Directed test cases for pow
;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index efd1dd5..3fa8b11 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,6 +1,6 @@
; powf.tst - Directed test cases for powf
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index b4b2526..4b33d22 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,6 +1,6 @@
; Directed test cases for SP sincos
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index 13cfdca..ded80b1 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,6 +1,6 @@
; sinf.tst - Directed test cases for SP sine
;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
; SPDX-License-Identifier: MIT
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c37e837..c24ff80 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,6 +1,6 @@
!! double.tst - Random test case specification for DP functions
!!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
!! SPDX-License-Identifier: MIT
test exp 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index baf62b9..d02a227 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,6 +1,6 @@
!! single.tst - Random test case specification for SP functions
!!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
!! SPDX-License-Identifier: MIT
test sinf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 371567a..51479b8 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,7 +1,7 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -331,11 +331,13 @@ static const struct fun fun[] = {
F1 (log)
F1 (log2)
F2 (pow)
+ F1 (erf)
D1 (exp)
D1 (exp2)
D1 (log)
D1 (log2)
D2 (pow)
+ D1 (erf)
#if WANT_VMATH
F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index f479fc5..2ff436f 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -3,7 +3,7 @@
# remez.jl - implementation of the Remez algorithm for polynomial approximation
#
-# Copyright (c) 2015-2018, Arm Limited.
+# Copyright (c) 2015-2019, Arm Limited.
# SPDX-License-Identifier: MIT
import Base.\
diff --git a/math/v_math.h b/math/v_math.h
index 3db22e5..f2cc467 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,7 +1,7 @@
/*
* Vector math abstractions.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/networking/Dir.mk b/networking/Dir.mk
new file mode 100644
index 0000000..b496103
--- /dev/null
+++ b/networking/Dir.mk
@@ -0,0 +1,76 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+S := $(srcdir)/networking
+B := build/networking
+
+ifeq ($(ARCH),)
+all-networking check-networking install-networking clean-networking:
+ @echo "*** Please set ARCH in config.mk. ***"
+ @exit 1
+else
+
+networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS])
+networking-test-srcs := $(wildcard $(S)/test/*.c)
+
+networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+
+networking-libs := \
+ build/lib/libnetworking.so \
+ build/lib/libnetworking.a \
+
+networking-tools := \
+ build/bin/test/chksum
+
+networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs)))
+networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs)))
+
+networking-objs := \
+ $(networking-lib-objs) \
+ $(networking-lib-objs:%.o=%.os) \
+ $(networking-test-objs) \
+
+networking-files := \
+ $(networking-objs) \
+ $(networking-libs) \
+ $(networking-tools) \
+ $(networking-includes) \
+
+all-networking: $(networking-libs) $(networking-tools) $(networking-includes)
+
+$(networking-objs): $(networking-includes)
+$(networking-objs): CFLAGS_ALL += $(networking-cflags)
+
+build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os)
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
+
+build/lib/libnetworkinglib.a: $(networking-lib-objs)
+ rm -f $@
+ $(AR) rc $@ $^
+ $(RANLIB) $@
+
+build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/include/%.h: $(S)/include/%.h
+ cp $< $@
+
+build/bin/%.sh: $(S)/test/%.sh
+ cp $< $@
+
+check-networking: $(networking-tools)
+ $(EMULATOR) build/bin/test/chksum -i simple
+ $(EMULATOR) build/bin/test/chksum -i scalar
+ $(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available
+
+install-networking: \
+ $(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
+ $(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%)
+
+clean-networking:
+ rm -f $(networking-files)
+endif
+
+.PHONY: all-networking check-networking install-networking clean-networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
new file mode 100644
index 0000000..6d5be58
--- /dev/null
+++ b/networking/aarch64/chksum_simd.c
@@ -0,0 +1,146 @@
+/*
+ * AArch64-specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+always_inline
+static inline uint64_t
+slurp_head64(const void **pptr, uint32_t *nbytes)
+{
+ Assert(*nbytes >= 8);
+ uint64_t sum = 0;
+ uint32_t off = (uintptr_t) *pptr % 8;
+ if (likely(off != 0))
+ {
+ /* Get rid of bytes 0..off-1 */
+ const unsigned char *ptr64 = align_ptr(*pptr, 8);
+ uint64_t mask = ALL_ONES << (CHAR_BIT * off);
+ uint64_t val = load64(ptr64) & mask;
+ /* Fold 64-bit sum to 33 bits */
+ sum = val >> 32;
+ sum += (uint32_t) val;
+ *pptr = ptr64 + 8;
+ *nbytes -= 8 - off;
+ }
+ return sum;
+}
+
+always_inline
+static inline uint64_t
+slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
+{
+ Assert(nbytes < 8);
+ if (likely(nbytes != 0))
+ {
+ /* Get rid of bytes 7..nbytes */
+ uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
+ Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
+ uint64_t val = load64(ptr) & mask;
+ sum += val >> 32;
+ sum += (uint32_t) val;
+ nbytes = 0;
+ }
+ Assert(nbytes == 0);
+ return sum;
+}
+
+unsigned short
+__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
+{
+ bool swap = (uintptr_t) ptr & 1;
+ uint64_t sum;
+
+ if (unlikely(nbytes < 50))
+ {
+ sum = slurp_small(ptr, nbytes);
+ swap = false;
+ goto fold;
+ }
+
+ /* 8-byte align pointer */
+ Assert(nbytes >= 8);
+ sum = slurp_head64(&ptr, &nbytes);
+ Assert(((uintptr_t) ptr & 7) == 0);
+
+ const uint32_t *may_alias ptr32 = ptr;
+
+ uint64x2_t vsum0 = { 0, 0 };
+ uint64x2_t vsum1 = { 0, 0 };
+ uint64x2_t vsum2 = { 0, 0 };
+ uint64x2_t vsum3 = { 0, 0 };
+
+ /* Sum groups of 64 bytes */
+ for (uint32_t i = 0; i < nbytes / 64; i++)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+ uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ vsum2 = vpadalq_u32(vsum2, vtmp2);
+ vsum3 = vpadalq_u32(vsum3, vtmp3);
+ ptr32 += 16;
+ }
+ nbytes %= 64;
+
+ /* Fold vsum2 and vsum3 into vsum0 and vsum1 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+ vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+
+ /* Add any trailing group of 32 bytes */
+ if (nbytes & 32)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ ptr32 += 8;
+ nbytes -= 32;
+ }
+ Assert(nbytes < 32);
+
+ /* Fold vsum1 into vsum0 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+ /* Add any trailing group of 16 bytes */
+ if (nbytes & 16)
+ {
+ uint32x4_t vtmp = vld1q_u32(ptr32);
+ vsum0 = vpadalq_u32(vsum0, vtmp);
+ ptr32 += 4;
+ nbytes -= 16;
+ }
+ Assert(nbytes < 16);
+
+ /* Add any trailing group of 8 bytes */
+ if (nbytes & 8)
+ {
+ uint32x2_t vtmp = vld1_u32(ptr32);
+ vsum0 = vaddw_u32(vsum0, vtmp);
+ ptr32 += 2;
+ nbytes -= 8;
+ }
+ Assert(nbytes < 8);
+
+ uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
+ sum += val >> 32;
+ sum += (uint32_t) val;
+
+ /* Handle any trailing 0..7 bytes */
+ sum = slurp_tail64(sum, ptr32, nbytes);
+
+fold:
+ return fold_and_swap(sum, swap);
+}
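
Note (not part of the patch): the SIMD implementation above accumulates a wide ones'-complement sum and hands it to fold_and_swap(), which is defined in networking/chksum_common.h and not shown in this hunk. The sketch below shows the conventional fold such a helper performs (propagate carries down to 16 bits, then swap byte lanes if the buffer started on an odd address); it is only an illustration and omits the final bit-inversion the Internet checksum proper applies:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Fold a wide ones'-complement sum down to 16 bits.  */
static uint16_t
fold_sum (uint64_t sum, bool swap)
{
  while (sum >> 16)                       /* add carries back in */
    sum = (sum & 0xffff) + (sum >> 16);
  uint16_t csum = (uint16_t) sum;
  if (swap)                               /* undo the odd-address byte swap */
    csum = (uint16_t) ((csum >> 8) | (csum << 8));
  return csum;
}

int
main (void)
{
  /* 16-bit words of a toy header; 0xffff forces a carry out of bit 15.  */
  uint64_t sum = 0x4500 + 0x0073 + 0xffff;
  printf ("folded: 0x%04x\n", fold_sum (sum, false));
  return 0;
}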
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
new file mode 100644
index 0000000..7f69adf
--- /dev/null
+++ b/networking/arm/chksum_simd.c
@@ -0,0 +1,149 @@
+/*
+ * Armv7-A specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+unsigned short
+__chksum_arm_simd(const void *ptr, unsigned int nbytes)
+{
+ bool swap = (uintptr_t) ptr & 1;
+ uint64x1_t vsum = { 0 };
+
+ if (unlikely(nbytes < 40))
+ {
+ uint64_t sum = slurp_small(ptr, nbytes);
+ return fold_and_swap(sum, false);
+ }
+
+ /* 8-byte align pointer */
+ /* Inline slurp_head-like code since we use NEON here */
+ Assert(nbytes >= 8);
+ uint32_t off = (uintptr_t) ptr & 7;
+ if (likely(off != 0))
+ {
+ const uint64_t *may_alias ptr64 = align_ptr(ptr, 8);
+ uint64x1_t vword64 = vld1_u64(ptr64);
+ /* Get rid of bytes 0..off-1 */
+ uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+ int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off);
+ vmask = vshl_u64(vmask, vshiftl);
+ vword64 = vand_u64(vword64, vmask);
+ uint32x2_t vtmp = vreinterpret_u32_u64(vword64);
+ /* Set accumulator */
+ vsum = vpaddl_u32(vtmp);
+ /* Update pointer and remaining size */
+ ptr = (char *) ptr64 + 8;
+ nbytes -= 8 - off;
+ }
+ Assert(((uintptr_t) ptr & 7) == 0);
+
+ /* Sum groups of 64 bytes */
+ uint64x2_t vsum0 = { 0, 0 };
+ uint64x2_t vsum1 = { 0, 0 };
+ uint64x2_t vsum2 = { 0, 0 };
+ uint64x2_t vsum3 = { 0, 0 };
+ const uint32_t *may_alias ptr32 = ptr;
+ for (uint32_t i = 0; i < nbytes / 64; i++)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+ uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ vsum2 = vpadalq_u32(vsum2, vtmp2);
+ vsum3 = vpadalq_u32(vsum3, vtmp3);
+ ptr32 += 16;
+ }
+ nbytes %= 64;
+
+ /* Fold vsum1/vsum2/vsum3 into vsum0 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+ vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+ /* Add any trailing 16-byte groups */
+ while (likely(nbytes >= 16))
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ ptr32 += 4;
+ nbytes -= 16;
+ }
+ Assert(nbytes < 16);
+
+ /* Fold vsum0 into vsum */
+ {
+ /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+ /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */
+ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+ /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+ Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0);
+ Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0);
+ uint32x2_t vtmp = vmovn_u64(vsum0);
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vtmp);
+ }
+
+ /* Add any trailing group of 8 bytes */
+ if (nbytes & 8)
+ {
+ uint32x2_t vtmp = vld1_u32(ptr32);
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vtmp);
+ ptr32 += 2;
+ nbytes -= 8;
+ }
+ Assert(nbytes < 8);
+
+ /* Handle any trailing 1..7 bytes */
+ if (likely(nbytes != 0))
+ {
+ Assert(((uintptr_t) ptr32 & 7) == 0);
+ Assert(nbytes < 8);
+ uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32);
+ /* Get rid of bytes nbytes..7 */
+ uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+ int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes));
+ vmask = vshl_u64(vmask, vshiftr); /* Shift right */
+ vword64 = vand_u64(vword64, vmask);
+ /* Fold 64-bit sum to 33 bits */
+ vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64));
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
+ }
+
+ /* Fold 64-bit vsum to 32 bits */
+ vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+ vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+ Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
+
+ /* Fold 32-bit vsum to 16 bits */
+ uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
+ vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+ vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0);
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0);
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0);
+
+ /* Convert to 16-bit scalar */
+ uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0);
+
+ if (unlikely(swap)) /* Odd base pointer is unexpected */
+ {
+ sum = bswap16(sum);
+ }
+ return sum;
+}
diff --git a/networking/chksum.c b/networking/chksum.c
new file mode 100644
index 0000000..95ce5ba
--- /dev/null
+++ b/networking/chksum.c
@@ -0,0 +1,81 @@
+/*
+ * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
+ * This sum is often used as a simple checksum in networking.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "chksum_common.h"
+
+always_inline
+static inline uint32_t
+slurp_head32(const void **pptr, uint32_t *nbytes)
+{
+ uint32_t sum = 0;
+ Assert(*nbytes >= 4);
+ uint32_t off = (uintptr_t) *pptr % 4;
+ if (likely(off != 0))
+ {
+ /* Get rid of bytes 0..off-1 */
+ const unsigned char *ptr32 = align_ptr(*pptr, 4);
+ uint32_t mask = ~0U << (CHAR_BIT * off);
+ sum = load32(ptr32) & mask;
+ *pptr = ptr32 + 4;
+ *nbytes -= 4 - off;
+ }
+ return sum;
+}
+
+/* Additional loop unrolling would help when not auto-vectorizing */
+unsigned short
+__chksum(const void *ptr, unsigned int nbytes)
+{
+ bool swap = false;
+ uint64_t sum = 0;
+
+ if (nbytes > 300)
+ {
+ /* 4-byte align pointer */
+ swap = (uintptr_t) ptr & 1;
+ sum = slurp_head32(&ptr, &nbytes);
+ }
+ /* Else benefit of aligning not worth the overhead */
+
+ /* Sum all 16-byte chunks */
+ const char *cptr = ptr;
+ for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
+ {
+ uint64_t h0 = load32(cptr + 0);
+ uint64_t h1 = load32(cptr + 4);
+ uint64_t h2 = load32(cptr + 8);
+ uint64_t h3 = load32(cptr + 12);
+ sum += h0 + h1 + h2 + h3;
+ cptr += 16;
+ }
+ nbytes %= 16;
+ Assert(nbytes < 16);
+
+ /* Handle any trailing 4-byte chunks */
+ while (nbytes >= 4)
+ {
+ sum += load32(cptr);
+ cptr += 4;
+ nbytes -= 4;
+ }
+ Assert(nbytes < 4);
+
+ if (nbytes & 2)
+ {
+ sum += load16(cptr);
+ cptr += 2;
+ }
+
+ if (nbytes & 1)
+ {
+ sum += *(uint8_t *)cptr;
+ }
+
+ return fold_and_swap(sum, swap);
+}
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
new file mode 100644
index 0000000..958c8cc
--- /dev/null
+++ b/networking/chksum_common.h
@@ -0,0 +1,132 @@
+/*
+ * Common code for checksum implementations
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef CHKSUM_COMMON_H
+#define CHKSUM_COMMON_H
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+#error Only little endian supported
+#endif
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Assertions must be explicitly enabled */
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define may_alias __attribute__((__may_alias__))
+#define always_inline __attribute__((always_inline))
+#ifdef __clang__
+#define no_unroll_loops
+#else
+#define no_unroll_loops __attribute__((optimize("no-unroll-loops")))
+#endif
+#define bswap16(x) __builtin_bswap16((x))
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#define may_alias
+#define always_inline
+#define no_unroll_loops
+#define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8))
+#endif
+
+#define ALL_ONES ~UINT64_C(0)
+
+static inline
+uint64_t load64(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint64_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+static inline
+uint32_t load32(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint32_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+static inline
+uint16_t load16(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint16_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+/* slurp_small() is for small buffers, don't waste cycles on alignment */
+no_unroll_loops
+always_inline
+static inline uint64_t
+slurp_small(const void *ptr, uint32_t nbytes)
+{
+ const unsigned char *cptr = ptr;
+ uint64_t sum = 0;
+ while (nbytes >= 4)
+ {
+ sum += load32(cptr);
+ cptr += 4;
+ nbytes -= 4;
+ }
+ if (nbytes & 2)
+ {
+ sum += load16(cptr);
+ cptr += 2;
+ }
+ if (nbytes & 1)
+ {
+ sum += (uint8_t) *cptr;
+ }
+ return sum;
+}
+
+static inline const void *
+align_ptr(const void *ptr, size_t bytes)
+{
+ return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes);
+}
+
+always_inline
+static inline uint16_t
+fold_and_swap(uint64_t sum, bool swap)
+{
+ /* Fold 64-bit sum to 32 bits */
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ Assert(sum == (uint32_t) sum);
+
+ /* Fold 32-bit sum to 16 bits */
+ sum = (sum & 0xffff) + (sum >> 16);
+ sum = (sum & 0xffff) + (sum >> 16);
+ Assert(sum == (uint16_t) sum);
+
+ if (unlikely(swap)) /* Odd base pointer is unexpected */
+ {
+ sum = bswap16(sum);
+ }
+
+ return (uint16_t) sum;
+}
+
+#endif
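
Not part of the patch: a small standalone illustration of what fold_and_swap() above computes. The repeated folds implement the ones'-complement end-around carry, and the final byte swap compensates for summing from an odd base address (the usual Internet-checksum byte-order property); the accumulator value below is arbitrary.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t sum = 0x00000002fffd0004ull;   /* example 64-bit accumulator */

        sum = (sum & 0xffffffff) + (sum >> 32); /* 64 -> at most 33 bits */
        sum = (sum & 0xffffffff) + (sum >> 32); /* -> 32 bits */
        sum = (sum & 0xffff) + (sum >> 16);     /* 32 -> at most 17 bits */
        sum = (sum & 0xffff) + (sum >> 16);     /* -> 16 bits */
        printf("folded:  %04x\n", (unsigned) sum);          /* prints 0004 */

        /* Summing the same data one byte off (odd base pointer) yields the
           byte-swapped result, so swapping restores the correct value. */
        uint16_t swapped = (uint16_t) ((sum >> 8) | (sum << 8));
        printf("swapped: %04x\n", swapped);                 /* prints 0400 */
        return 0;
    }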
diff --git a/networking/include/networking.h b/networking/include/networking.h
new file mode 100644
index 0000000..a88feff
--- /dev/null
+++ b/networking/include/networking.h
@@ -0,0 +1,14 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+unsigned short __chksum (const void *, unsigned int);
+#if __aarch64__ && __ARM_NEON
+unsigned short __chksum_aarch64_simd (const void *, unsigned int);
+#endif
+#if __arm__ && __ARM_NEON
+unsigned short __chksum_arm_simd (const void *, unsigned int);
+#endif
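
Not part of the patch: a hedged usage sketch of the public entry point declared above, assuming networking/include is on the include path. The buffer contents are arbitrary; __chksum() returns the folded ones'-complement sum, and a protocol header's checksum field is normally the complement of that value.

    #include <stdio.h>
    #include "networking.h"

    int main(void)
    {
        unsigned char buf[20] = { 0x45, 0x00, 0x00, 0x14 }; /* rest zero-filled */
        unsigned short sum = __chksum(buf, sizeof buf);
        printf("sum=%04x checksum=%04x\n", sum, (unsigned short) ~sum);
        return 0;
    }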
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
new file mode 100644
index 0000000..41b9812
--- /dev/null
+++ b/networking/test/chksum.c
@@ -0,0 +1,381 @@
+/*
+ * Ones' complement checksum test & benchmark
+ *
+ * Copyright (c) 2016-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include "../include/networking.h"
+
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define may_alias __attribute__((__may_alias__))
+#else
+#define may_alias
+#endif
+
+#define CACHE_LINE 64
+#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
+
+/* Reference implementation - do not modify! */
+static uint16_t
+checksum_simple(const void *ptr, uint32_t nbytes)
+{
+ const uint16_t *may_alias hptr = ptr;
+ uint64_t sum = 0; /* Need 64-bit accumulator when nbytes > 64K */
+
+ /* Sum all halfwords, assume misaligned accesses are handled in HW */
+ for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
+ {
+ sum += *hptr++;
+ }
+
+ /* Add any trailing odd byte */
+ if ((nbytes & 0x01) != 0)
+ {
+ sum += *(uint8_t *) hptr;
+ }
+
+ /* Fold 64-bit sum to 32 bits */
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ Assert(sum == (uint32_t) sum);
+
+ /* Fold 32-bit sum to 16 bits */
+ sum = (sum & 0xffff) + (sum >> 16);
+ sum = (sum & 0xffff) + (sum >> 16);
+ Assert(sum == (uint16_t) sum);
+
+ return (uint16_t) sum;
+}
+
+static struct
+{
+ uint16_t (*cksum_fp)(const void *, uint32_t);
+ const char *name;
+} implementations[] =
+{
+ { checksum_simple, "simple"},
+ { __chksum, "scalar"},
+#if __arm__
+ { __chksum_arm_simd, "simd" },
+#elif __aarch64__
+ { __chksum_aarch64_simd, "simd" },
+#endif
+ { NULL, NULL}
+};
+
+static int
+find_impl(const char *name)
+{
+ for (int i = 0; implementations[i].name != NULL; i++)
+ {
+ if (strcmp(implementations[i].name, name) == 0)
+ {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static uint16_t (*CKSUM_FP)(const void *, uint32_t);
+static volatile uint16_t SINK;
+
+static bool
+verify(const void *data, uint32_t offset, uint32_t size)
+{
+
+ uint16_t csum_expected = checksum_simple(data, size);
+ uint16_t csum_actual = CKSUM_FP(data, size);
+ if (csum_actual != csum_expected)
+ {
+ fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
+ "actual %04x expected %04x (valid)",
+ offset, size, csum_actual, csum_expected);
+ if (size < 65536)
+ {
+ /* Fatal error */
+ exit(EXIT_FAILURE);
+ }
+ /* Else some implementations only support sizes up to 2^16 */
+ return false;
+ }
+ return true;
+}
+
+static uint64_t
+clock_get_ns(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+static void
+benchmark(const uint8_t *base,
+ size_t poolsize,
+ uint32_t blksize,
+ uint32_t numops,
+ uint64_t cpufreq)
+{
+ printf("%11u ", (unsigned int) blksize); fflush(stdout);
+
+ uint64_t start = clock_get_ns();
+ for (uint32_t i = 0; i < numops; i ++)
+ {
+ /* Read a random value from the pool */
+ uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
+ /* Generate a random starting address */
+ const void *data = &base[random % (poolsize - blksize)];
+ SINK = CKSUM_FP(data, blksize);
+ }
+ uint64_t end = clock_get_ns();
+
+#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
+ uint64_t elapsed_ns = end - start;
+ uint64_t elapsed_ms = elapsed_ns / 1000000;
+ uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
+ uint64_t accbytes = (uint64_t) numops * blksize;
+ printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
+ unsigned int cyc_per_blk = cpufreq / blks_per_s;
+ printf("%11u ", cyc_per_blk);
+ if (blksize != 0)
+ {
+ unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
+ printf("%7u.%03u ",
+ cyc_per_byte / 1000, cyc_per_byte % 1000);
+ }
+ printf("\n");
+}
+
+int main(int argc, char *argv[])
+{
+ int c;
+ bool DUMP = false;
+ uint32_t IMPL = 0; /* Simple implementation */
+ uint64_t CPUFREQ = 0;
+ uint32_t BLKSIZE = 0;
+ uint32_t NUMOPS = 1000000;
+ uint32_t POOLSIZE = 512 * 1024; /* Typical ARM L2 cache size */
+
+ setvbuf(stdout, NULL, _IOLBF, 160);
+ while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
+ {
+ switch (c)
+ {
+ case 'b' :
+ {
+ int blksize = atoi(optarg);
+ if (blksize < 1 || blksize > POOLSIZE / 2)
+ {
+ fprintf(stderr, "Invalid block size %d\n", blksize);
+ exit(EXIT_FAILURE);
+ }
+ BLKSIZE = (unsigned) blksize;
+ break;
+ }
+ case 'd' :
+ DUMP = true;
+ break;
+ case 'f' :
+ {
+ int64_t cpufreq = atoll(optarg);
+ if (cpufreq < 1)
+ {
+ fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
+ cpufreq);
+ exit(EXIT_FAILURE);
+ }
+ CPUFREQ = cpufreq;
+ break;
+ }
+ case 'i' :
+ {
+ int impl = find_impl(optarg);
+ if (impl < 0)
+ {
+ fprintf(stderr, "Invalid implementation %s\n", optarg);
+ goto usage;
+ }
+ IMPL = (unsigned) impl;
+ break;
+ }
+ case 'n' :
+ {
+ int numops = atoi(optarg);
+ if (numops < 1)
+ {
+ fprintf(stderr, "Invalid number of operations %d\n", numops);
+ exit(EXIT_FAILURE);
+ }
+ NUMOPS = (unsigned) numops;
+ break;
+ }
+ case 'p' :
+ {
+ int poolsize = atoi(optarg);
+ if (poolsize < 4096)
+ {
+ fprintf(stderr, "Invalid pool size %d\n", poolsize);
+ exit(EXIT_FAILURE);
+ }
+ char c = optarg[strlen(optarg) - 1];
+ if (c == 'M')
+ {
+ POOLSIZE = (unsigned) poolsize * 1024 * 1024;
+ }
+ else if (c == 'K')
+ {
+ POOLSIZE = (unsigned) poolsize * 1024;
+ }
+ else
+ {
+ POOLSIZE = (unsigned) poolsize;
+ }
+ break;
+ }
+ default :
+usage :
+ fprintf(stderr, "Usage: checksum <options>\n"
+ "-b <blksize> Block size\n"
+ "-d Dump first 96 bytes of data\n"
+ "-f <cpufreq> CPU frequency (Hz)\n"
+ "-i <impl> Implementation\n"
+ "-n <numops> Number of operations\n"
+ "-p <poolsize> Pool size (K or M suffix)\n"
+ );
+ printf("Implementations:");
+ for (int i = 0; implementations[i].name != NULL; i++)
+ {
+ printf(" %s", implementations[i].name);
+ }
+ printf("\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (optind > argc)
+ {
+ goto usage;
+ }
+
+ CKSUM_FP = implementations[IMPL].cksum_fp;
+ POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
+ uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (base == MAP_FAILED)
+ {
+ perror("aligned_alloc"), exit(EXIT_FAILURE);
+ }
+ for (size_t i = 0; i < POOLSIZE / 4; i++)
+ {
+ ((uint32_t *) base)[i] = rand();
+ }
+
+ printf("Implementation: %s\n", implementations[IMPL].name);
+ printf("numops %u, poolsize ", NUMOPS);
+ if (POOLSIZE % (1024 * 1024) == 0)
+ {
+ printf("%uMiB", POOLSIZE / (1024 * 1024));
+ }
+ else if (POOLSIZE % 1024 == 0)
+ {
+ printf("%uKiB", POOLSIZE / 1024);
+ }
+ else
+ {
+ printf("%uB", POOLSIZE);
+ }
+ printf(", blocksize %u, CPU frequency %juMHz\n",
+ BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
+#if WANT_ASSERT
+ printf("Warning: assertions are enabled\n");
+#endif
+
+ if (DUMP)
+ {
+ /* Print out first 96 bytes of data for human debugging */
+ for (int i = 0; i < 96; i++)
+ {
+ if (i % 8 == 0)
+ printf("%2u:", i);
+ printf(" %02x", base[i]);
+ if (i % 8 == 7)
+ printf("\n");
+ }
+ }
+
+ /* Verify that the chosen algorithm handles all combinations of offsets and sizes */
+ printf("Verifying..."); fflush(stdout);
+ bool success = true;
+ /* Check all (relevant) combinations of size and offset */
+ for (int size = 0; size <= 256; size++)
+ {
+ for (int offset = 0; offset < 255; offset++)
+ {
+ /* Check at start of mapped memory */
+ success &= verify(&base[offset], offset, size);
+ /* Check at end of mapped memory */
+ uint8_t *p = base + POOLSIZE - (size + offset);
+ success &= verify(p, (uintptr_t) p % 64, size);
+ }
+ }
+ /* Check increasingly larger sizes */
+ for (size_t size = 1; size < POOLSIZE; size *= 2)
+ {
+ success &= verify(base, 0, size);
+ }
+ /* Check the full size, this can detect accumulator overflows */
+ success &= verify(base, 0, POOLSIZE);
+ printf("%s\n", success ? "OK" : "failure");
+
+ /* Print throughput in decimal megabyte (1000000B) per second */
+ if (CPUFREQ != 0)
+ {
+ printf("%11s %11s %11s %11s\n",
+ "block size", "MB/s", "cycles/blk", "cycles/byte");
+ }
+ else
+ {
+ printf("%11s %11s %11s %11s\n",
+ "block size", "MB/s", "ns/blk", "ns/byte");
+ CPUFREQ = 1000000000;
+ }
+ if (BLKSIZE != 0)
+ {
+ benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
+ }
+ else
+ {
+ static const uint16_t sizes[] =
+ { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
+ for (int i = 0; sizes[i] != 0; i++)
+ {
+ uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
+ benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
+ }
+ }
+
+ if (munmap(base, POOLSIZE) != 0)
+ {
+ perror("munmap"), exit(EXIT_FAILURE);
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/run-arm-optimized-routines-tests-on-android.sh b/run-arm-optimized-routines-tests-on-android.sh
index 61efeaf..21163a3 100755
--- a/run-arm-optimized-routines-tests-on-android.sh
+++ b/run-arm-optimized-routines-tests-on-android.sh
@@ -25,16 +25,20 @@ check_failure() {
}
# Run the 32-bit tests.
-adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/*
-check_failure
+if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest/mathtest/mathtest" ]; then
+ adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/*
+ check_failure
+fi
# TODO: these tests are currently a bloodbath.
#adb shell 'cp /data/nativetest/ulp/math/test/runulp.sh /data/nativetest/ulp/ && sh /data/nativetest/ulp/runulp.sh'
#check_failure
# Run the 64-bit tests.
-adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/*
-check_failure
+if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest64/mathtest/mathtest" ]; then
+ adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/*
+ check_failure
+fi
# TODO: these tests are currently a bloodbath.
#adb shell 'cp /data/nativetest64/ulp/math/test/runulp.sh /data/nativetest64/ulp/ && sh /data/nativetest64/ulp/runulp.sh'
diff --git a/string/Dir.mk b/string/Dir.mk
index 470917a..cf3453f 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,13 +1,20 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019, Arm Limited.
+# Copyright (c) 2019-2021, Arm Limited.
# SPDX-License-Identifier: MIT
S := $(srcdir)/string
B := build/string
-string-lib-srcs := $(wildcard $(S)/*.[cS])
+ifeq ($(ARCH),)
+all-string bench-string check-string install-string clean-string:
+ @echo "*** Please set ARCH in config.mk. ***"
+ @exit 1
+else
+
+string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
string-test-srcs := $(wildcard $(S)/test/*.c)
+string-bench-srcs := $(wildcard $(S)/bench/*.c)
string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
@@ -15,13 +22,17 @@ string-libs := \
build/lib/libstringlib.so \
build/lib/libstringlib.a \
-string-tools := \
+string-tests := \
build/bin/test/memcpy \
build/bin/test/memmove \
build/bin/test/memset \
build/bin/test/memchr \
+ build/bin/test/memrchr \
build/bin/test/memcmp \
+ build/bin/test/__mtag_tag_region \
+ build/bin/test/__mtag_tag_zero_region \
build/bin/test/strcpy \
+ build/bin/test/stpcpy \
build/bin/test/strcmp \
build/bin/test/strchr \
build/bin/test/strrchr \
@@ -30,25 +41,34 @@ string-tools := \
build/bin/test/strnlen \
build/bin/test/strncmp
+string-benches := \
+ build/bin/bench/memcpy \
+ build/bin/bench/strlen
+
string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
+string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs)))
string-objs := \
$(string-lib-objs) \
$(string-lib-objs:%.o=%.os) \
$(string-test-objs) \
+ $(string-bench-objs)
string-files := \
$(string-objs) \
$(string-libs) \
- $(string-tools) \
+ $(string-tests) \
+ $(string-benches) \
$(string-includes) \
-all-string: $(string-libs) $(string-tools) $(string-includes)
+all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
$(string-objs): $(string-includes)
$(string-objs): CFLAGS_ALL += $(string-cflags)
+$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
+
build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
@@ -60,26 +80,27 @@ build/lib/libstringlib.a: $(string-lib-objs)
build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
build/include/%.h: $(S)/include/%.h
cp $< $@
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
-check-string: $(string-tools)
- $(EMULATOR) build/bin/test/memcpy
- $(EMULATOR) build/bin/test/memmove
- $(EMULATOR) build/bin/test/memset
- $(EMULATOR) build/bin/test/memchr
- $(EMULATOR) build/bin/test/memcmp
- $(EMULATOR) build/bin/test/strcpy
- $(EMULATOR) build/bin/test/strcmp
- $(EMULATOR) build/bin/test/strchr
- $(EMULATOR) build/bin/test/strrchr
- $(EMULATOR) build/bin/test/strchrnul
- $(EMULATOR) build/bin/test/strlen
- $(EMULATOR) build/bin/test/strnlen
- $(EMULATOR) build/bin/test/strncmp
+string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
+
+build/string/test/%.out: build/bin/test/%
+ $(EMULATOR) $^ | tee $@.tmp
+ mv $@.tmp $@
+
+check-string: $(string-tests-out)
+ ! grep FAIL $^
+
+bench-string: $(string-benches)
+ $(EMULATOR) build/bin/bench/strlen
+ $(EMULATOR) build/bin/bench/memcpy
install-string: \
$(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
@@ -87,5 +108,6 @@ install-string: \
clean-string:
rm -f $(string-files)
+endif
-.PHONY: all-string check-string install-string clean-string
+.PHONY: all-string bench-string check-string install-string clean-string
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
new file mode 100644
index 0000000..84339f7
--- /dev/null
+++ b/string/aarch64/__mtag_tag_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_region - tag memory
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stg dstin, [dstin]
+ stg dstin, [tmp]
+ stg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ st2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ st2g dstin, [dst, 32]
+ st2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_region)
+#endif
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
new file mode 100644
index 0000000..f58364c
--- /dev/null
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_zero_region - tag memory and fill it with zero bytes
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_zero_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stzg dstin, [dstin]
+ stzg dstin, [tmp]
+ stzg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ stz2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gzva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ stz2g dstin, [dst, 32]
+ stz2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_zero_region)
+#endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
new file mode 100644
index 0000000..5a54242
--- /dev/null
+++ b/string/aarch64/check-arch.S
@@ -0,0 +1,13 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__aarch64__
+# error ARCH setting does not match the compiler.
+#endif
+
+/* Include for GNU property notes. */
+#include "../asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
new file mode 100644
index 0000000..c2e967d
--- /dev/null
+++ b/string/aarch64/memchr-mte.S
@@ -0,0 +1,116 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__memchr_aarch64_mte)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ bic src, srcin, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+
+ rbit synd, synd
+ clz synd, synd
+ add result, srcin, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ add tmp, tmp, 16
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, 16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ add tmp, srcin, cntin
+ sub cntrem, tmp, src
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ cmp cntrem, synd, lsr 2
+ add result, src, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memchr_aarch64_mte)
+
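
Not part of the patch: a rough scalar model of the syndrome scheme described in the comment above. Each byte of a 16-byte chunk contributes one nibble to a 64-bit word in source order, so on a little-endian target the count of trailing zero bits, divided by four, is the index of the first matching byte. The GCC/Clang __builtin_ctzll stands in for the rbit/clz pair.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar stand-in for the NEON cmeq/mask/addp sequence. */
    static int first_match(const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= UINT64_C(0xf) << (4 * i);
        return synd ? __builtin_ctzll(synd) / 4 : -1;
    }

    int main(void)
    {
        const unsigned char chunk[16] = "0123456789abcdef";
        printf("%d\n", first_match(chunk, 'a'));   /* prints 10 */
        return 0;
    }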
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 0d75acd..c22e659 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,28 +1,27 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __memchr_aarch64_sve
- .type __memchr_aarch64_sve, %function
- .p2align 4
-__memchr_aarch64_sve:
+ENTRY (__memchr_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (2)
dup z1.b, w1 /* duplicate c to a vector */
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
- nop
+ .p2align 4
0: whilelo p1.b, x3, x2 /* make sure off < max */
b.none 9f
@@ -59,4 +58,7 @@ __memchr_aarch64_sve:
9: mov x0, 0 /* return null */
ret
- .size __memchr_aarch64_sve, . - __memchr_aarch64_sve
+END (__memchr_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 10be49e..353f0d1 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,7 +1,7 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -47,6 +47,8 @@
*/
ENTRY (__memchr_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
@@ -110,7 +112,7 @@ L(end):
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi L(tail)
+ b.hs L(tail)
L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
@@ -141,3 +143,4 @@ L(zero_length):
ret
END (__memchr_aarch64)
+
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index d4f6026..78c5eca 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,23 +1,23 @@
/*
* memcmp - compare memory
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __memcmp_aarch64_sve
- .type __memcmp_aarch64_sve, %function
- .p2align 4
-__memcmp_aarch64_sve:
+ENTRY (__memcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
mov x3, 0 /* initialize off */
0: whilelo p0.b, x3, x2 /* while off < max */
@@ -45,4 +45,7 @@ __memcmp_aarch64_sve:
9: mov x0, 0 /* return equality */
ret
- .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve
+END (__memcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 6722516..3b10266 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,6 +1,6 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -28,6 +28,9 @@
#define tmp2 x8
ENTRY (__memcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
@@ -131,3 +134,4 @@ L(byte_loop):
ret
END (__memcmp_aarch64)
+
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
new file mode 100644
index 0000000..f97f2c3
--- /dev/null
+++ b/string/aarch64/memcpy-advsimd.S
@@ -0,0 +1,206 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_simd)
+ENTRY (__memcpy_aarch64_simd)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+ ret
+
+END (__memcpy_aarch64_simd)
+
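
Not part of the patch: the case split in __memcpy_aarch64_simd above, rendered as a plain C sketch. Byte loops stand in for the 64-byte NEON iterations and the always-copy-the-last-64-bytes tail; only the dispatch structure is meant to match the assembly.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_fwd(unsigned char *d, const unsigned char *s, size_t n)
    {
        for (size_t i = 0; i < n; i++)   /* real code: 64 bytes/iteration, tail from end */
            d[i] = s[i];
    }

    static void copy_bwd(unsigned char *d, const unsigned char *s, size_t n)
    {
        while (n--)                      /* real code: 64 bytes/iteration, tail from start */
            d[n] = s[n];
    }

    void sketch_memmove(void *dst, const void *src, size_t n)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (n <= 128)
        {
            /* Small/medium: the real code loads everything from both ends before
               storing anything, so overlap needs no special case here. */
            memmove(d, s, n);
            return;
        }
        /* Backwards copy only when dst lands inside [src, src+n). */
        if ((uintptr_t) d - (uintptr_t) s < n)
            copy_bwd(d, s, n);
        else
            copy_fwd(d, s, n);
    }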
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 1aad88e..dd254f6 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -22,11 +22,11 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
@@ -40,119 +40,117 @@
#define H_h srcend
#define tmp1 x14
-/* This implementation of memcpy correctly handles overlaps, therefore
- __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and
- dst buffer overlap check from the start of memmove code to the
- beginning of large copy code, the overhead of combining memcpy
- and memmove implementations is negligible.
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
- Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..128 bytes which are fully unrolled, and large
- copies (moves).
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
- Large forward moves align the destination and use an unrolled loop
- processing 64 bytes per iteration.
-
- Large backward moves align dstend and use an unrolled loop processing
- 64 bytes per iteration.
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
*/
-ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
+ENTRY (__memcpy_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
cmp count, 128
- b.hi L(move_long)
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 17..128 bytes. */
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
- cmp count, 32
- b.hi L(copy33_128)
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- /* 8-15 bytes. */
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- /* 4-7 bytes. */
- tbz count, 2, 1f
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- .p2align 4
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 33..128 bytes. */
-L(copy33_128):
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
cmp count, 64
- b.hi L(copy65_128)
+ b.hi L(copy128)
stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
-L(copy65_128):
+L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
.p2align 4
- /* Move more than 128 bytes. */
-L(move_long):
- sub tmp1, dstin, src /* Overlap check. */
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count
- b.lo L(move_long_backwards)
+ b.lo L(copy_long_backwards)
- /* Align dst to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
@@ -179,9 +177,7 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
+ /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
@@ -195,20 +191,13 @@ L(copy64_from_end):
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
-
-L(copy0):
ret
.p2align 4
- /* Move more than 128 bytes where src and dst buffers overlap
- and dst > src.
-
- Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-L(move_long_backwards):
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
@@ -234,9 +223,7 @@ L(loop64_backwards):
subs count, count, 64
b.hi L(loop64_backwards)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
+ /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
@@ -253,3 +240,4 @@ L(copy64_from_start):
ret
END (__memcpy_aarch64)
+
diff --git a/string/aarch64/memcpy_simd.S b/string/aarch64/memcpy_simd.S
deleted file mode 100644
index fa2442f..0000000
--- a/string/aarch64/memcpy_simd.S
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * memcpy/memmove using SIMD registers
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l x14
-#define E_h x15
-#define F_l x16
-#define F_h x17
-#define G_l count
-#define G_h dst
-#define H_l src
-#define H_h srcend
-#define tmp1 x14
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-
-/* This implementation of memcpy correctly handles overlaps, therefore
- __memmove_aarch64_simd aliases to __memcpy_aarch64_simd. By moving the
- src and dst buffer overlap check from the start of memmove code to the
- beginning of large copy code, the overhead of combining memcpy
- and memmove implementations is negligible.
-
- Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..128 bytes which are fully unrolled, and large
- copies (moves).
-
- Large forward moves align the source and use an unrolled loop
- processing 64 bytes per iteration.
-
- Large backward moves align srcend and use an unrolled loop processing
- 64 bytes per iteration.
-*/
-
-ENTRY (__memcpy_aarch64_simd)
-ENTRY_ALIAS (__memmove_aarch64_simd)
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16_simd)
- cmp count, 128
- b.hi L(move_long_simd)
-
- /* Medium copies: 17..128 bytes. */
- ldr A_q, [src]
- ldr D_q, [srcend, -16]
- cmp count, 32
- b.hi L(copy33_128_simd)
- str A_q, [dstin]
- str D_q, [dstend, -16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16_simd):
- /* 8-15 bytes. */
- cmp count, 8
- b.lo 1f
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
-
- .p2align 4
-1:
- /* 4-7 bytes. */
- tbz count, 2, 1f
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- .p2align 4
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
-
- .p2align 4
- /* Copy 33..128 bytes. */
-L(copy33_128_simd):
- ldr B_q, [src, 16]
- ldr C_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy65_128_simd)
- str A_q, [dstin]
- str D_q, [dstend, -16]
- str B_q, [dstin, 16]
- str C_q, [dstend, -32]
- ret
-
- .p2align 4
- /* Copy 65..128 bytes. */
-L(copy65_128_simd):
- ldr E_q, [src, 32]
- ldr F_q, [src, 48]
- ldr G_q, [srcend, -64]
- ldr H_q, [srcend, -48]
- str A_q, [dstin]
- str D_q, [dstend, -16]
- str B_q, [dstin, 16]
- str C_q, [dstend, -32]
- str E_q, [dstin, 32]
- str F_q, [dstin, 48]
- str G_q, [dstend, -64]
- str H_q, [dstend, -48]
- ret
-
- .p2align 4
- /* Move more than 128 bytes. */
-L(move_long_simd):
- sub tmp1, dstin, src /* Overlap check. */
- cbz tmp1, L(copy0_simd)
- cmp tmp1, count
- b.lo L(move_long_backwards_simd)
-
- /* Align src to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- ldr D_q, [src]
- and tmp1, src, 15
- bic src, src, 15
- sub dst, dstin, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldr A_q, [src, 16]
- str D_q, [dstin]
- ldr B_q, [src, 32]
- ldr C_q, [src, 48]
- ldr D_q, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(copy64_from_end_simd)
-
-L(loop64_simd):
- str A_q, [dst, 16]
- ldr A_q, [src, 16]
- str B_q, [dst, 32]
- ldr B_q, [src, 32]
- str C_q, [dst, 48]
- ldr C_q, [src, 48]
- str D_q, [dst, 64]!
- ldr D_q, [src, 64]!
- subs count, count, 64
- b.hi L(loop64_simd)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(copy64_from_end_simd):
- ldr E_q, [srcend, -64]
- str A_q, [dst, 16]
- ldr A_q, [srcend, -48]
- str B_q, [dst, 32]
- ldr B_q, [srcend, -32]
- str C_q, [dst, 48]
- ldr C_q, [srcend, -16]
- str D_q, [dst, 64]
- str E_q, [dstend, -64]
- str A_q, [dstend, -48]
- str B_q, [dstend, -32]
- str C_q, [dstend, -16]
-
-L(copy0_simd):
- ret
-
- .p2align 4
-
- /* Move more than 128 bytes where src and dst buffers overlap
- and dst > src.
-
- Align srcend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
-L(move_long_backwards_simd):
- ldr D_q, [srcend, -16]
- and tmp1, srcend, 15
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldr A_q, [srcend, -16]
- str D_q, [dstend, -16]
- ldr B_q, [srcend, -32]
- ldr C_q, [srcend, -48]
- ldr D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls L(copy64_from_start_simd)
-
-L(loop64_backwards_simd):
- str A_q, [dstend, -16]
- ldr A_q, [srcend, -16]
- str B_q, [dstend, -32]
- ldr B_q, [srcend, -32]
- str C_q, [dstend, -48]
- ldr C_q, [srcend, -48]
- str D_q, [dstend, -64]!
- ldr D_q, [srcend, -64]!
- subs count, count, 64
- b.hi L(loop64_backwards_simd)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-L(copy64_from_start_simd):
- ldr G_q, [src, 48]
- str A_q, [dstend, -16]
- ldr A_q, [src, 32]
- str B_q, [dstend, -32]
- ldr B_q, [src, 16]
- str C_q, [dstend, -48]
- ldr C_q, [src]
- str D_q, [dstend, -64]
- str G_q, [dstin, 48]
- str A_q, [dstin, 32]
- str B_q, [dstin, 16]
- str C_q, [dstin]
- ret
-
-END (__memcpy_aarch64_simd)
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
new file mode 100644
index 0000000..7b4be84
--- /dev/null
+++ b/string/aarch64/memrchr.S
@@ -0,0 +1,117 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting leading zeros identifies exactly which byte matched last. */
+
+ENTRY (__memrchr_aarch64)
+ PTR_ARG (0)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memrchr_aarch64)
+
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 3868141..9fcd975 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,13 +1,13 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
@@ -19,14 +19,11 @@
#define count x2
#define dst x3
#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
+#define zva_val x5
ENTRY (__memset_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
@@ -42,7 +39,7 @@ ENTRY (__memset_aarch64)
str val, [dstin]
str val, [dstend, -8]
ret
- nop
+ .p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -72,108 +69,49 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 3
-L(try_zva):
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
add dst, dst, 64
+ dc zva, dst
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- b L(tail64)
-
END (__memset_aarch64)
+
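Note: the DC ZVA gate added to memset above can be modelled with a small helper (a sketch assuming GCC-style inline assembly on AArch64; the function name is made up). memset only takes the zva path when this size is 64 bytes, the fill value is zero and the length is at least 160 bytes:

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: DC ZVA block size in bytes, or 0 when the instruction is
       prohibited (DCZID_EL0.DZP set).  The BS field is the log2 of the
       block size in 4-byte words, so 64 bytes corresponds to BS == 4. */
    static size_t dc_zva_block_size (void)
    {
    #if defined (__aarch64__)
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 16)
        return 0;
      return (size_t) 4 << (dczid & 15);
    #else
      return 0;
    #endif
    }
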
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 0000000..f1c7119
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
new file mode 100644
index 0000000..82dd971
--- /dev/null
+++ b/string/aarch64/stpcpy-sve.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-sve.S"
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
new file mode 100644
index 0000000..4f62aa4
--- /dev/null
+++ b/string/aarch64/stpcpy.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy.S"
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
new file mode 100644
index 0000000..dcb0e46
--- /dev/null
+++ b/string/aarch64/strchr-mte.S
@@ -0,0 +1,105 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v6
+#define dend d6
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+ bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
+
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ /* Tmp1 is a multiple of 4 if the target character was
+ found first. Otherwise we've found the end of the string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+#else
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ /* Tmp1 is a multiple of 4 if the target character was
+ found first. Otherwise we've found the end of the string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64_mte)
+
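Note: the even/odd bit layout described in the strchr-mte comment lets one trailing-zero count both locate the terminating event and tell a character hit from the NUL terminator. A rough model of the decode (illustration only; the function name is made up):

    #include <stddef.h>
    #include <stdint.h>

    /* Nibble i has bits 0-1 set when byte i matched the character and bits
       2-3 set when byte i is NUL, so the low bits of the trailing-zero count
       say which event came first (tst tmp1, 2 in the assembly). */
    static const char *decode_strchr_syndrome (const char *chunk, uint64_t synd)
    {
      if (synd == 0)
        return NULL;                  /* no match or NUL in this chunk */
      unsigned tz = __builtin_ctzll (synd);
      if (tz & 2)
        return NULL;                  /* NUL came before the character */
      return chunk + (tz >> 2);
    }
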
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 8d8a319..13ba9f4 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,19 +1,19 @@
/*
* strchr/strchrnul - find a character in a string
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
#ifdef BUILD_STRCHRNUL
#define FUNC __strchrnul_aarch64_sve
@@ -21,10 +21,8 @@
#define FUNC __strchr_aarch64_sve
#endif
- .globl FUNC
- .type FUNC, %function
- .p2align 4
-FUNC:
+ENTRY (FUNC)
+ PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -66,4 +64,7 @@ FUNC:
incp x0, p0.b
b 0b
- .size FUNC, . - FUNC
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 00d9be3..1063cbf 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,7 +1,7 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -51,11 +51,12 @@
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
@@ -73,12 +74,10 @@ ENTRY (__strchr_aarch64)
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
@@ -89,31 +88,26 @@ ENTRY (__strchr_aarch64)
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
+ .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
@@ -129,3 +123,4 @@ L(tail):
ret
END (__strchr_aarch64)
+
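Note: the cmhs rewrite in the loop above folds the NUL test into the character test. Since the cmeq result is 0xff on a match and 0x00 otherwise, an unsigned "match >= data" byte compare is true exactly when the byte matched or is zero. A one-line model (illustration only):

    #include <stdint.h>

    /* Per-byte model of "cmhs vhas_nul, vhas_chr, vdata": 0xff when the byte
       matched the character (mask is 0xff) or the byte itself is NUL. */
    static uint8_t match_or_nul (uint8_t match_mask, uint8_t data)
    {
      return match_mask >= data ? 0xff : 0x00;
    }
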
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
new file mode 100644
index 0000000..1b0d0a6
--- /dev/null
+++ b/string/aarch64/strchrnul-mte.S
@@ -0,0 +1,84 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define tmp2 x3
+#define tmp2w w3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vend v5
+#define dend d5
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchrnul_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov tmp2w, 0xf00f
+ dup vrepmask.8h, tmp2w
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ lsl tmp2, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ add result, srcin, tmp1, lsr 2
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+#ifndef __AARCH64EB__
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ add result, src, tmp1, lsr 2
+ ret
+
+END (__strchrnul_aarch64_mte)
+
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 5140e59..428ff1a 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -1,7 +1,7 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 81264ea..a4230d9 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,7 +1,7 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -47,6 +47,7 @@
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
+ PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
@@ -63,14 +64,12 @@ ENTRY (__strchrnul_aarch64)
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
- orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
@@ -81,24 +80,22 @@ ENTRY (__strchrnul_aarch64)
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
+ .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
@@ -114,3 +111,4 @@ L(tail):
ret
END (__strchrnul_aarch64)
+
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 0000000..12d1a6b
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,189 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define src1 x0
+#define src2 x1
+#define result x0
+
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define off1 x5
+#define syndrome x6
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
+
+ENTRY (__strcmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
+ b.ne L(misaligned8)
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
+L(loop_aligned):
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ rev data2, data2
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, shift
+ lsl data2, data2, shift
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
+ ret
+
+ .p2align 4
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, 7
+ b.ne L(do_misaligned)
+
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64_mte)
+
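Note: the dword NUL test used throughout the compare routines is the bit trick spelled out in the comment at the top of strcmp-mte.S. In C (sketch only):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of x is zero: per byte this is
       (X - 1) & ~(X | 0x7f), i.e. (X - 1) & ~X & 0x80.  Borrow propagation
       can also flag 0x01 bytes next to a real NUL, which is why the assembly
       byte-reverses the data on big-endian before locating the first NUL. */
    static inline uint64_t has_nul_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }
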
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index 91bac19..e6d2da5 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,29 +1,28 @@
/*
* __strcmp_aarch64_sve - compare two strings
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __strcmp_aarch64_sve
- .type __strcmp_aarch64_sve, %function
- .p2align 4
-__strcmp_aarch64_sve:
+ENTRY (__strcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
setffr /* initialize FFR */
ptrue p1.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
- nop
/* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
0: ldff1b z0.b, p1/z, [x0, x2]
ldff1b z1.b, p1/z, [x1, x2]
rdffrs p0.b, p1/z
@@ -54,4 +53,7 @@ __strcmp_aarch64_sve:
b.none 0b
b 1b
- .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve
+END (__strcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 65af5ce..7714ebf 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,7 +1,7 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -37,6 +37,8 @@
/* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -168,3 +170,4 @@ L(done):
ret
END (__strcmp_aarch64)
+
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 0000000..88c222d
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,161 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define srcin x1
+#define result x0
+
+#define src x2
+#define dst x3
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
+ str data1, [dstin]
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin
+ ldr dataq2, [srcin]
+ add dst, dstin, len
+ str dataq2, [dstin]
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz len, synd
+ lsr len, len, 2
+ sub tmp, len, 15
+ ldr dataq, [src, tmp]
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
+
+END (STRCPY)
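Note: once the length is known, strcpy-mte copies short strings with a pair of possibly overlapping loads and stores (see L(less16)/L(less8) above). A sketch of the 8..15 byte case (illustration only; len is the index of the NUL byte):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Copy a string of length len (8 <= len <= 15) including its NUL by
       copying the first eight and the last eight bytes; the two stores
       overlap in the middle but together cover bytes 0..len. */
    static void copy_len_8_to_15 (char *dst, const char *src, size_t len)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);
      memcpy (&tail, src + len - 7, 8);
      memcpy (dst, &head, 8);
      memcpy (dst + len - 7, &tail, 8);
    }
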
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index c929f37..f515462 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,19 +1,19 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
#ifdef BUILD_STPCPY
#define FUNC __stpcpy_aarch64_sve
@@ -21,10 +21,9 @@
#define FUNC __strcpy_aarch64_sve
#endif
- .globl FUNC
- .type FUNC, %function
- .p2align 4
-FUNC:
+ENTRY (FUNC)
+ PTR_ARG (0)
+ PTR_ARG (1)
setffr /* initialize FFR */
ptrue p2.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -66,4 +65,7 @@ FUNC:
#endif
ret
- .size FUNC, . - FUNC
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 4edffcf..6e9ed42 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,7 +1,7 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2013-2019, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -80,6 +80,8 @@
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
@@ -306,3 +308,4 @@ L(page_cross):
b L(fp_gt8)
END (STRCPY)
+
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
new file mode 100644
index 0000000..7cf41d5
--- /dev/null
+++ b/string/aarch64/strlen-mte.S
@@ -0,0 +1,80 @@
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define result x0
+
+#define src x1
+#define synd x2
+#define tmp x3
+#define wtmp w3
+#define shift x4
+
+#define data q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strlen_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz result, synd
+ lsr result, result, 2
+ ret
+
+ .p2align 5
+L(loop):
+ ldr data, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ sub result, src, srcin
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
+END (__strlen_aarch64_mte)
+
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 64ede85..2392493 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,31 +1,28 @@
/*
* __strlen_aarch64_sve - compute the length of a string
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __strlen_aarch64_sve
- .type __strlen_aarch64_sve, %function
- .p2align 4
-__strlen_aarch64_sve:
+ENTRY (__strlen_aarch64_sve)
+ PTR_ARG (0)
setffr /* initialize FFR */
ptrue p2.b /* all ones; loop invariant */
mov x1, 0 /* initialize length */
- nop
/* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
0: ldff1b z0.b, p2/z, [x0, x1]
- nop
rdffrs p0.b, p2/z
b.nlast 2f
@@ -52,4 +49,7 @@ __strlen_aarch64_sve:
incp x1, p0.b
b 0b
- .size __strlen_aarch64_sve, . - __strlen_aarch64_sve
+END (__strlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 2293f73..a1b164a 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,88 @@
/*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
#include "../asmdefs.h"
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- __strlen_aarch64 will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
+ PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,113 +98,103 @@ ENTRY (__strlen_aarch64)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-L(page_cross_entry):
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+L(loop_entry):
+ bic src, srcin, 31
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
END (__strlen_aarch64)
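Note: the page-cross check described in the new strlen comment is a single mask-and-compare on the source address. As a sketch (the macro value mirrors the MIN_PAGE_SIZE default above):

    #include <stdint.h>

    #define MIN_PAGE_SIZE 4096

    /* A 32-byte unaligned read starting at s stays inside one minimum-sized
       page unless s falls in the last 31 bytes of a page; only then is the
       slower L(page_cross) entry taken. */
    static inline int page_cross (const char *s)
    {
      return ((uintptr_t) s & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 32;
    }
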
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 0000000..c9d6fc8
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,307 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ENTRY (__strncmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit, limit, #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
+
+L(ret0):
+ mov result, #0
+ ret
+END(__strncmp_aarch64_mte)
+
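Note: the overflow-safe limit adjustment mentioned in the L(mutual_align) comment is a saturating add (adds + csinv in the assembly). Roughly (sketch only):

    #include <stddef.h>
    #include <stdint.h>

    /* After rounding the sources down to an 8-byte boundary, `count` extra
       bytes are compared, so the limit grows by count and saturates at
       SIZE_MAX instead of wrapping when it is already near the top. */
    static inline size_t adjust_limit (size_t limit, size_t count)
    {
      size_t adjusted = limit + count;
      return adjusted < limit ? SIZE_MAX : adjusted;
    }
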
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 6f31eca..234190e 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,23 +1,23 @@
/*
* strncmp - compare two strings with limit
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __strncmp_aarch64_sve
- .type __strncmp_aarch64_sve, %function
- .p2align 4
-__strncmp_aarch64_sve:
+ENTRY (__strncmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -63,4 +63,7 @@ __strncmp_aarch64_sve:
9: mov x0, 0 /* return equal */
ret
- .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve
+END (__strncmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index fbd08ee..738b653 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,7 +1,7 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -40,12 +40,10 @@
#define endloop x15
#define count mask
- .text
- .p2align 6
- .rep 7
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-ENTRY_ALIGN (__strncmp_aarch64, 0)
+ENTRY (__strncmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -60,7 +58,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
+ .p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -73,7 +71,7 @@ L(start_realigned):
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
+ /* End of main loop */
/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, L(not_limit)
@@ -178,7 +176,7 @@ L(mutual_align):
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
@@ -259,3 +257,4 @@ L(ret0):
ret
END ( __strncmp_aarch64)
+
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 3a9be08..5b9ebf7 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,23 +1,22 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __strnlen_aarch64_sve
- .type __strnlen_aarch64_sve, %function
- .p2align 4
-__strnlen_aarch64_sve:
+ENTRY (__strnlen_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (1)
setffr /* initialize FFR */
mov x2, 0 /* initialize len */
b 1f
@@ -66,7 +65,10 @@ __strnlen_aarch64_sve:
b 1b
/* End of count. Return max. */
-9: mov x0, x2
+9: mov x0, x1
ret
- .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve
+END (__strnlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index df66b60..48d2495 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,155 +1,112 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include "../asmdefs.h"
-/* Arguments and results. */
#define srcin x0
-#define len x0
-#define limit x1
+#define cntin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
-#define data1 x3
-#define data2 x4
-#define data2a x5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define pos x13
-#define limit_wd x14
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- .text
- .p2align 6
-L(start):
- /* Pre-pad to ensure critical loop begins an icache line. */
- .rep 7
- nop
- .endr
- /* Put this code here to avoid wasting more space with pre-padding. */
-L(hit_limit):
- mov len, limit
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strnlen_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-ENTRY_ALIGN (__strnlen_aarch64, 0)
- cbz limit, L(hit_limit)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne L(misaligned)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-
- /* Start of critial section -- keep to one 64Byte cache line. */
-L(loop):
- ldp data1, data2, [src], #16
-L(realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- subs limit_wd, limit_wd, #1
- orr tmp1, has_nul1, has_nul2
- ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq L(loop)
- /* End of critical section -- keep to one 64Byte cache line. */
-
- orr tmp1, has_nul1, has_nul2
- cbz tmp1, L(hit_limit) /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
- sub len, src, srcin
- cbz has_nul1, L(nul_in_data2)
-#ifdef __AARCH64EB__
- mov data2, data1
-#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-L(nul_in_data2):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-L(misaligned):
- /* Deal with a partial first word.
- We're doing two things in parallel here;
- 1) Calculate the number of words (but avoiding overflow if
- limit is near ULONG_MAX) - to do this we need to work out
- limit + tmp1 - 1 as a 65-bit value before shifting it;
- 2) Load and mask the initial data words - we force the bytes
- before the ones we are interested in to 0xff - this ensures
- early bytes will not hit any zero detection. */
- sub limit_wd, limit, #1
- neg tmp4, tmp1
- cmp tmp1, #8
-
- and tmp3, limit_wd, #15
- lsr limit_wd, limit_wd, #4
- mov tmp2, #~0
-
- ldp data1, data2, [src], #16
- lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
- add tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit_wd, tmp3, lsr #4
-
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
-
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b L(realigned)
+L(nomatch):
+ mov result, cntin
+ ret
END (__strnlen_aarch64)
+
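The core-algorithm comment in the __strnlen_aarch64 hunk above is compact, so the following stand-alone C sketch (illustrative only, not part of the patch; the helper name is hypothetical) models the same idea at scalar level: each byte of a 16-byte chunk contributes one nibble to a 64-bit syndrome, so counting trailing zeros and dividing by four yields the index of the first NUL, which the code then clamps to cntin with the cmp/csel pair at L(finish).

/* Scalar model of the 4-bits-per-byte syndrome used by __strnlen_aarch64.
   Illustrative sketch only; the real code builds the syndrome with NEON
   (cmeq/and/addp) and reads it out through fmov.  */
#include <stddef.h>
#include <stdint.h>

static size_t
chunk_first_nul (const unsigned char *chunk)  /* chunk points at 16 bytes */
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      synd |= 0xfULL << (i * 4);  /* set the nibble for byte i */
  if (synd == 0)
    return 16;                    /* no NUL in this chunk */
  return (size_t) __builtin_ctzll (synd) / 4;  /* index of the first NUL */
}
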
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 0000000..1e4fb1a
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,127 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
+
+#define vrepchr v0
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bits 0-1 are set if
+ the relevant byte matched the requested character; bits 2-3 are set
+ if the relevant byte matched the NUL end of string. */
+
+ENTRY (__strrchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+L(tail):
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
+ csel result, result, xzr, ne
+ ret
+
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
+ ret
+
+END (__strrchr_aarch64_mte)
+
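The strrchr-mte header comment packs two events into each nibble: bits 0-1 flag a character match and bits 2-3 flag the terminating NUL. The scalar sketch below (illustrative only, not part of the patch; the helper name is hypothetical) shows the effect of the tail sequence above: masking the match syndrome with nul_match - 1 drops matches beyond the first NUL, and the highest surviving nibble gives the last occurrence.

/* Scalar model of the tail handling in __strrchr_aarch64_mte.  Returns the
   index of the last match of c before the first NUL in a 16-byte chunk, or
   -1 if there is none.  Illustrative sketch only.  */
#include <stdint.h>

static int
chunk_last_match (const unsigned char *chunk, unsigned char c)
{
  uint64_t chr = 0, nul = 0;
  for (int i = 0; i < 16; i++)
    {
      if (chunk[i] == c)
        chr |= 0x3ULL << (i * 4);  /* bits 0-1 of nibble i */
      if (chunk[i] == 0)
        nul |= 0xcULL << (i * 4);  /* bits 2-3 of nibble i */
    }
  if (nul != 0)
    chr &= nul - 1;                /* keep only matches before the NUL */
  if (chr == 0)
    return -1;
  return (63 - __builtin_clzll (chr)) / 4;  /* highest set nibble */
}
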
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index bb522e7..d36d69a 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,23 +1,21 @@
/*
* strrchr - find the last of a character in a string
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
- .arch armv8-a+sve
- .text
-
- .globl __strrchr_aarch64_sve
- .type __strrchr_aarch64_sve, %function
- .p2align 4
-__strrchr_aarch64_sve:
+ENTRY (__strrchr_aarch64_sve)
+ PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -80,4 +78,7 @@ __strrchr_aarch64_sve:
5: mov x0, 0
ret
- .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve
+END (__strrchr_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 1b4caac..56185ff 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,6 +55,7 @@
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
+ PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
@@ -84,38 +85,38 @@ ENTRY (__strrchr_aarch64)
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vhas_nul1.d[0]
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
- mov chr_match, vhas_chr1.d[0]
lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
cbnz nul_match, L(tail)
+ .p2align 4
L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ uminp vend1.16b, vdata1.16b, vdata2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vend1.16b, vend1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
- mov chr_match, vhas_chr1.d[0]
+ mov chr_match, vend1.d[1]
cbz nul_match, L(loop)
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
@@ -145,3 +146,4 @@ L(tail):
ret
END (__strrchr_aarch64)
+
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
new file mode 100644
index 0000000..1cff934
--- /dev/null
+++ b/string/arm/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__arm__
+# error ARCH setting does not match the compiler.
+#endif
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 2eff4d1..3f1ac4d 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,7 +1,7 @@
/*
* memchr - scan memory for a character
*
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -31,7 +31,6 @@
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
- .text
.thumb
@ ---------------------------------------------------------------------------
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index aab78a2..86e6493 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
- bge L(cpy_not_short)
+ bhs L(cpy_not_short)
/* Deal with small copies quickly by dropping straight into the
exit block. */
@@ -239,10 +239,10 @@ L(cpy_not_short):
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
- blt L(tail63aligned)
+ blo L(tail63aligned)
cmp tmp2, #512
- bge L(cpy_body_long)
+ bhs L(cpy_body_long)
L(cpy_body_medium): /* Count in tmp2. */
#ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium): /* Count in tmp2. */
add src, src, #64
vstr d1, [dst, #56]
add dst, dst, #64
- bge 1b
+ bhs 1b
tst tmp2, #0x3f
beq L(done)
@@ -312,7 +312,7 @@ L(tail63aligned): /* Count in tmp2. */
ldrd A_l, A_h, [src, #64]!
strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
- bge 1b
+ bhs 1b
tst tmp2, #0x3f
bne 1f
ldr tmp2,[sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long): /* Count in tmp2. */
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
- blt 2f
+ blo 2f
1:
cpy_line_vfp d3, 0
cpy_line_vfp d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long): /* Count in tmp2. */
add dst, dst, #2 * 64
add src, src, #2 * 64
subs tmp2, tmp2, #prefetch_lines * 64
- bge 1b
+ bhs 1b
2:
cpy_tail_vfp d3, 0
@@ -499,15 +499,15 @@ L(cpy_notaligned):
1:
pld [src, #(3 * 64)]
subs count, count, #64
- ldrmi tmp2, [sp], #FRAME_SIZE
- bmi L(tail63unaligned)
+ ldrlo tmp2, [sp], #FRAME_SIZE
+ blo L(tail63unaligned)
pld [src, #(4 * 64)]
#ifdef USE_NEON
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- bmi 2f
+ blo 2f
1:
pld [src, #(4 * 64)]
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- bpl 1b
+ bhs 1b
2:
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
diff --git a/string/arm/memset.S b/string/arm/memset.S
index 3ee5238..11e9273 100644
--- a/string/arm/memset.S
+++ b/string/arm/memset.S
@@ -1,7 +1,7 @@
/*
* memset - fill memory with a constant
*
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -25,7 +25,6 @@
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
- .text
.thumb
@ ---------------------------------------------------------------------------
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index d615231..b75d414 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,10 +1,12 @@
/*
* strcmp for ARMv6-M (optimized for performance, not size)
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+
.thumb_func
.syntax unified
.arch armv6-m
@@ -111,3 +113,5 @@ ENTRY_ALIGN (__strcmp_armv6m, 4)
pop {r4, r5, r6, pc}
END (__strcmp_armv6m)
+
+#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index 295db8b..51443e3 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,10 +1,12 @@
/*
* strcmp for ARMv7
*
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+
/* Implementation of strcmp for ARMv7 when DSP instructions are
available. Use ldrd to support wider loads, provided the data
is sufficiently aligned. Use saturating arithmetic to optimize
@@ -123,7 +125,6 @@
#endif
.endm
- .text
.p2align 5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
@@ -470,3 +471,5 @@ L(strcmp_tail):
bx lr
END (__strcmp_arm)
+
+#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
index 48ebbe8..02cf94f 100644
--- a/string/arm/strcpy.c
+++ b/string/arm/strcpy.c
@@ -1,10 +1,12 @@
/*
* strcpy
*
- * Copyright (c) 2008-2019, Arm Limited.
+ * Copyright (c) 2008-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#if defined (__thumb2__) && !defined (__thumb__)
+
/* For GLIBC:
#include <string.h>
#include <memcopy.h>
@@ -127,3 +129,5 @@ __strcpy_arm (char* dst, const char* src)
"BX LR");
}
/* For GLIBC: libc_hidden_builtin_def (strcpy) */
+
+#endif /* defined (__thumb2__) && !defined (__thumb__) */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 76e6930..5ad30c9 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,10 +1,12 @@
/*
* strlen - calculate the length of a string
*
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+
/*
Assumes:
ARMv6T2, AArch32
@@ -118,3 +120,5 @@ L(misaligned8):
b L(start_realigned)
END (__strlen_armv6t2)
+
+#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 7d143a9..340b427 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -1,13 +1,64 @@
/*
* Macros for asm code.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
+#if defined(__aarch64__)
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
@@ -15,6 +66,8 @@
name: \
.cfi_startproc;
+#endif
+
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
@@ -28,4 +81,18 @@
#define L(l) .L ## l
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
#endif
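The new PTR_ARG and SIZE_ARG macros address an ILP32 detail of AAPCS64: a 32-bit pointer or size argument arrives in the low half of a 64-bit register whose upper 32 bits are not guaranteed to be zero, and mov wN, wN zero-extends it before it is used as a 64-bit address or count. A rough C analogy of that sanitisation (purely illustrative, not part of the patch; the function name is hypothetical):

/* Illustrative only: why PTR_ARG expands to "mov wN, wN" on ILP32.
   Truncating to 32 bits and widening again clears whatever padding the
   caller left in the top half of the register.  */
#include <stdint.h>

static uint64_t
sanitize_ilp32_arg (uint64_t reg_with_padding)
{
  return (uint32_t) reg_with_padding;  /* zero-extend the low 32 bits */
}
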
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
new file mode 100644
index 0000000..d5d4ea7
--- /dev/null
+++ b/string/bench/memcpy.c
@@ -0,0 +1,260 @@
+/*
+ * memcpy benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 5000
+#define ITERS2 20000000
+#define ITERS3 500000
+#define MAX_COPIES 8192
+#define SIZE (256*1024)
+
+static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
+} funtab[] =
+{
+ F(memcpy)
+#if __aarch64__
+ F(__memcpy_aarch64)
+# if __ARM_NEON
+ F(__memcpy_aarch64_simd)
+# endif
+#elif __arm__
+ F(__memcpy_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t size_arr[SIZE_NUM];
+
+/* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. */
+static freq_data_t size_freq[] =
+{
+{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035},
+{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
+{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460},
+{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303},
+{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
+{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96},
+{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68},
+{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47},
+{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35},
+{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22},
+{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15},
+{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11},
+{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9},
+{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8},
+{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6},
+{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3},
+{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2},
+{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2},
+{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1},
+{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1},
+{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1},
+{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1},
+{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1},
+{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1},
+{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1},
+{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1},
+{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1},
+{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1},
+{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1},
+{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1},
+{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t src_align_arr[ALIGN_NUM];
+static uint8_t dst_align_arr[ALIGN_NUM];
+
+/* Source alignment frequency for memcpy based on SPEC2017. */
+static align_data_t src_align_freq[] =
+{
+ {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
+};
+
+static align_data_t dst_align_freq[] =
+{
+ {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
+};
+
+typedef struct
+{
+ uint64_t src : 24;
+ uint64_t dst : 24;
+ uint64_t len : 16;
+} copy_t;
+
+static copy_t copy[MAX_COPIES];
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+init_copy_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
+ for (j = 0, size = size_freq[i].size; j < freq; j++)
+ size_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = src_align_freq[i].align; j < freq; j++)
+ src_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+
+ for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
+ dst_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_copies (size_t max_size)
+{
+ size_t total = 0;
+ /* Create a random set of copies with the given size and alignment
+ distributions. */
+ for (int i = 0; i < MAX_COPIES; i++)
+ {
+ copy[i].dst = (rand32 (0) & (max_size - 1));
+ copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].src = (rand32 (0) & (max_size - 1));
+ copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += copy[i].len;
+ }
+
+ return total;
+}
+
+int main (void)
+{
+ init_copy_distribution ();
+
+ memset (a, 1, sizeof (a));
+ memset (b, 2, sizeof (b));
+
+ printf("Random memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t total = 0;
+ uint64_t tsum = 0;
+ printf ("%22s (B/ns) ", funtab[f].name);
+ rand32 (0x12345678);
+
+ for (int size = 16384; size <= SIZE; size *= 2)
+ {
+ size_t copy_size = init_copies (size) * ITERS;
+
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ t = clock_get_ns () - t;
+ total += copy_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ }
+ printf( "avg %.2f\n", (double)total / tsum);
+ }
+
+ printf ("\nMedium memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 16; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nLarge memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nUnaligned forwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+
+ printf ("\nUnaligned backwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ return 0;
+}
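The benchmark draws copy sizes and alignments from SPEC2017-derived frequency tables by expanding each (size, freq) pair into a flat lookup table and indexing it with rand32, so a size appears with probability freq divided by the table length. A minimal sketch of that expansion, mirroring init_copy_distribution above (illustrative only, not part of the patch; the helper name is hypothetical):

/* Expand (size, freq) pairs into a flat table so that indexing it with a
   uniform random value samples each size with the requested relative
   frequency.  Illustrative sketch only.  */
#include <assert.h>
#include <stdint.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

static void
expand_distribution (const freq_data_t *freq, uint16_t *table, int table_len)
{
  int n = 0;
  for (int i = 0; freq[i].freq != 0; i++)
    for (int j = 0; j < freq[i].freq; j++)
      table[n++] = freq[i].size;
  assert (n == table_len);  /* the frequencies must sum to the table length */
}
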
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 0000000..cc0f04b
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017. */
+static freq_data_t strlen_len_freq[] =
+{
+ { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115},
+ { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418},
+ { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79},
+ { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21},
+ { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9},
+ { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5},
+ { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2},
+ { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1},
+ { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1},
+ { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1},
+ { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1},
+ {107, 1}, { 0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017. */
+static align_data_t string_align_freq[] =
+{
+ {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+ strlen_len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+ strlen_align_arr[n++] = size;
+ assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+ uint16_t index[MAX_ALIGN];
+
+ memset (a, 'x', sizeof (a));
+
+ /* Create indices for strings at all alignments. */
+ for (int i = 0; i < MAX_ALIGN; i++)
+ {
+ index[i] = i * (MAX_STRLEN + 1);
+ a[index[i] + MAX_STRLEN] = 0;
+ }
+
+ /* Create a random set of strlen input strings using the string length
+ and alignment distributions. */
+ for (int n = 0; n < NUM_STRLEN; n++)
+ {
+ int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+ int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+ strlen_tests[n] =
+ index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ printf ("\nRandom strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t res = 0, strlen_size = 0, mask = maskv;
+ printf ("%22s ", funtab[f].name);
+
+ for (int c = 0; c < NUM_STRLEN; c++)
+ strlen_size += funtab[f].fun (a + strlen_tests[c]);
+ strlen_size *= ITERS;
+
+ /* Measure latency of strlen result with (res & mask). */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_STRLEN; c++)
+ res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("%.2f\n", (double)strlen_size / t);
+ }
+
+ printf ("\nSmall aligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nSmall unaligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a + align);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nMedium strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 128; size <= 4096; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\n");
+
+ return 0;
+}
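One subtle point in the random-strlen loop above: maskv is a volatile zero, so res & mask is always 0 at run time, yet the compiler has to assume the next string's address depends on the previous result. That dependency chains the calls and makes the loop measure strlen latency rather than throughput. A stripped-down sketch of the same trick (illustrative only, not part of the patch; the function name is hypothetical):

/* Illustrative only: chain calls through a value the compiler cannot prove
   to be zero, so each iteration must wait for the previous result.  */
#include <stddef.h>
#include <string.h>

static volatile size_t maskv = 0;

static size_t
measure_latency_chain (const char *s, int iters)
{
  size_t res = 0, mask = maskv;  /* mask is 0, but only known at run time */
  for (int i = 0; i < iters; i++)
    res = strlen (s + (res & mask));  /* next address depends on res */
  return res;
}
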
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
new file mode 100644
index 0000000..0f2ce2e
--- /dev/null
+++ b/string/include/benchlib.h
@@ -0,0 +1,33 @@
+/*
+ * Benchmark support functions.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <time.h>
+
+/* Fast and accurate timer returning nanoseconds. */
+static inline uint64_t
+clock_get_ns (void)
+{
+ struct timespec ts;
+ clock_gettime (CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+/* Fast 32-bit random number generator. Passing a non-zero seed
+ value resets the internal state. */
+static inline uint32_t
+rand32 (uint32_t seed)
+{
+ static uint64_t state = 0xb707be451df0bb19ULL;
+ if (seed != 0)
+ state = seed;
+ uint32_t res = state >> 32;
+ state = state * 6364136223846793005ULL + 1;
+ return res;
+}
+
+
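For reference, the rand32 helper above is a 64-bit linear congruential generator (the multiplier is the constant Knuth uses for MMIX) that returns the high 32 bits of its state, which have better statistical quality than the low bits. The benchmarks seed it once with a non-zero value for reproducible runs and then pass 0 to draw values, roughly as in this sketch (illustrative only, not part of the patch; the helper name is hypothetical):

/* Illustrative usage of rand32 () from benchlib.h.  */
#include <stdint.h>
#include "benchlib.h"

static uint32_t
draw_index (void)
{
  static int seeded;
  if (!seeded)
    {
      rand32 (0x12345678);  /* a non-zero argument resets the state */
      seeded = 1;
    }
  return rand32 (0) & 1023;  /* 0 means: just draw the next value */
}
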
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index b3b6181..378c3cd 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -17,8 +17,10 @@ void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64 (void *, const void *, size_t);
void *__memset_aarch64 (void *, int, size_t);
void *__memchr_aarch64 (const void *, int, size_t);
+void *__memrchr_aarch64 (const void *, int, size_t);
int __memcmp_aarch64 (const void *, const void *, size_t);
char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64 (char *__restrict, const char *__restrict);
int __strcmp_aarch64 (const char *, const char *);
char *__strchr_aarch64 (const char *, int);
char *__strrchr_aarch64 (const char *, int);
@@ -26,6 +28,15 @@ char *__strchrnul_aarch64 (const char *, int );
size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
+void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__strchr_aarch64_mte (const char *, int);
+char * __strchrnul_aarch64_mte (const char *, int );
+size_t __strlen_aarch64_mte (const char *);
+char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
@@ -38,10 +49,15 @@ char *__strrchr_aarch64_sve (const char *, int);
char *__strchrnul_aarch64_sve (const char *, int );
int __strcmp_aarch64_sve (const char *, const char *);
char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict);
size_t __strlen_aarch64_sve (const char *);
size_t __strnlen_aarch64_sve (const char *, size_t);
int __strncmp_aarch64_sve (const char *, const char *, size_t);
# endif
+# if __ARM_FEATURE_MEMORY_TAGGING
+void *__mtag_tag_region (void *, size_t);
+void *__mtag_tag_zero_region (void *, size_t);
+# endif
#elif __arm__
void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
void *__memset_arm (void *, int, size_t);
diff --git a/string/memchr.S b/string/memchr.S
deleted file mode 100644
index 0a564d8..0000000
--- a/string/memchr.S
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Selected possible memchr implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/memchr.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/memchr-sve.S"
-# endif
-#elif __arm__
-#include "arm/memchr.S"
-#endif
diff --git a/string/memcmp.S b/string/memcmp.S
deleted file mode 100644
index 22da685..0000000
--- a/string/memcmp.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible memcpy implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/memcmp.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/memcmp-sve.S"
-# endif
-#endif
diff --git a/string/memcpy.S b/string/memcpy.S
deleted file mode 100644
index b52b603..0000000
--- a/string/memcpy.S
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Selected possible memcpy implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/memcpy.S"
-# if __ARM_NEON
-#include "aarch64/memcpy_simd.S"
-# endif
-#elif __arm__
-#include "arm/memcpy.S"
-#endif
diff --git a/string/memset.S b/string/memset.S
deleted file mode 100644
index 57542ef..0000000
--- a/string/memset.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Selected possible memset implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/memset.S"
-#elif __arm__
-#include "arm/memset.S"
-#endif
diff --git a/string/strchr.S b/string/strchr.S
deleted file mode 100644
index 8cead02..0000000
--- a/string/strchr.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strchr implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strchr.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strchr-sve.S"
-# endif
-#endif
diff --git a/string/strchrnul.S b/string/strchrnul.S
deleted file mode 100644
index 3dfdeef..0000000
--- a/string/strchrnul.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strchr implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strchrnul.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strchrnul-sve.S"
-# endif
-#endif
diff --git a/string/strcmp.S b/string/strcmp.S
deleted file mode 100644
index 12530ec..0000000
--- a/string/strcmp.S
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Selected possible strcmp implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strcmp.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strcmp-sve.S"
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
-#include "arm/strcmp.S"
-# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
-#include "arm/strcmp-armv6m.S"
-# endif
-#endif
diff --git a/string/strcpy-c.c b/string/strcpy-c.c
deleted file mode 100644
index 6bde24a..0000000
--- a/string/strcpy-c.c
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Selected possible strcpy implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __arm__ && defined (__thumb2__) && !defined (__thumb__)
-#include "arm/strcpy.c"
-#endif
diff --git a/string/strcpy.S b/string/strcpy.S
deleted file mode 100644
index a604b22..0000000
--- a/string/strcpy.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strcpy implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strcpy.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strcpy-sve.S"
-# endif
-#endif
diff --git a/string/strlen.S b/string/strlen.S
deleted file mode 100644
index d681033..0000000
--- a/string/strlen.S
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Selected possible strlen implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strlen.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strlen-sve.S"
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
-#include "arm/strlen-armv6t2.S"
-# endif
-#endif
diff --git a/string/strncmp.S b/string/strncmp.S
deleted file mode 100644
index 26b56b7..0000000
--- a/string/strncmp.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strncmp implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strncmp.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strncmp-sve.S"
-# endif
-#endif
diff --git a/string/strnlen.S b/string/strnlen.S
deleted file mode 100644
index eebe777..0000000
--- a/string/strnlen.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strnlen implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strnlen.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strnlen-sve.S"
-# endif
-#endif
diff --git a/string/strrchr.S b/string/strrchr.S
deleted file mode 100644
index 119b1d5..0000000
--- a/string/strrchr.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Selected possible strrchr implementations.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "aarch64/strrchr.S"
-# if __ARM_FEATURE_SVE
-#include "aarch64/strrchr-sve.S"
-# endif
-#endif
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
new file mode 100644
index 0000000..d8c02d9
--- /dev/null
+++ b/string/test/__mtag_tag_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a';
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 'a')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
new file mode 100644
index 0000000..221c223
--- /dev/null
+++ b/string/test/__mtag_tag_zero_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_zero_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_zero_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 0)
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
index 1ebc6d6..0ff77f5 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -1,7 +1,7 @@
/*
* memchr test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,84 +10,101 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun)(const void *, int c, size_t n);
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(memchr)
+ // clang-format off
+ F(memchr, 0)
#if __aarch64__
-F(__memchr_aarch64)
+ F(__memchr_aarch64, 0)
+ F(__memchr_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__memchr_aarch64_sve)
+ F(__memchr_aarch64_sve, 1)
# endif
#elif __arm__
-F(__memchr_arm)
+ F(__memchr_arm, 0)
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static unsigned char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
{
- unsigned char *src = alignup(sbuf);
- unsigned char *s = src + align;
- unsigned char *f = len ? s + seekpos : 0;
- int seekchar = 0x1;
- int i;
- void *p;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
- if (len > LEN || seekpos >= len || align >= A)
- abort();
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
- for (i = 0; i < seekpos; i++)
- s[i] = 'a' + i%23;
- s[i++] = seekchar;
- for (; i < len; i++)
- s[i] = 'a' + i%23;
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar;
- p = fun->fun(s, seekchar, len);
+ int mte_len = seekpos != -1 ? seekpos + 1 : maxlen;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
- if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
- }
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- for (int n = 0; n < 100; n++)
- for (int sp = 0; sp < n-1; sp++)
- test(funtab+i, a, sp, n);
- for (int n = 100; n < LEN; n *= 2) {
- test(funtab+i, a, n-1, n);
- test(funtab+i, a, n/2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
+ test (funtab + i, a, n, n, SIZE_MAX - a);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index 114f1d7..7a7cf9c 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -1,7 +1,7 @@
/*
* memcmp test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,88 +9,117 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun)(const void *s1, const void *s2, size_t n);
+ const char *name;
+ int (*fun) (const void *s1, const void *s2, size_t n);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(memcmp)
+ // clang-format off
+ F(memcmp, 0)
#if __aarch64__
-F(__memcmp_aarch64)
+ F(__memcmp_aarch64, 1)
# if __ARM_FEATURE_SVE
-F(__memcmp_aarch64_sve)
+ F(__memcmp_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static unsigned char s1buf[LEN+2*A];
-static unsigned char s2buf[LEN+2*A];
+static unsigned char *s1buf;
+static unsigned char *s2buf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
{
- unsigned char *src1 = alignup(s1buf);
- unsigned char *src2 = alignup(s2buf);
- unsigned char *s1 = src1 + s1align;
- unsigned char *s2 = src2 + s2align;
- int r;
+ unsigned char *src1 = alignup (s1buf);
+ unsigned char *src2 = alignup (s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
- if (len > LEN || s1align >= A || s2align >= A)
- abort();
- if (diffpos && diffpos >= len)
- abort();
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
- for (int i = 0; i < len+A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len; i++)
- s1[i] = s2[i] = 'a' + i%23;
- if (diffpos)
- s1[diffpos]++;
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
- r = fun->fun(s1, s2, len);
+ s1 = tag_buffer (s1, len, fun->test_mte);
+ s2 = tag_buffer (s2, len, fun->test_mte);
+ r = fun->fun (s1, s2, len);
+ untag_buffer (s1, len, fun->test_mte);
+ untag_buffer (s2, len, fun->test_mte);
- if ((!diffpos && r != 0) || (diffpos && r == 0)) {
- ERR("%s(align %d, align %d, %d) failed, returned %d\n",
- fun->name, s1align, s2align, len, r);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
- }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
}
-int main()
+int
+main ()
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ s1buf = mte_mmap (LEN + 2 * A);
+ s2buf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, -1);
+ test (funtab + i, d, s, 1, 0, 1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, 0, -1);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 8572452..ce0ceee 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,7 +1,7 @@
/*
* memcpy test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,90 +9,112 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun)(void *, const void *, size_t);
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(memcpy)
+ // clang-format off
+ F(memcpy, 0)
#if __aarch64__
-F(__memcpy_aarch64)
+ F(__memcpy_aarch64, 1)
# if __ARM_NEON
-F(__memcpy_aarch64_simd)
+ F(__memcpy_aarch64_simd, 1)
# endif
#elif __arm__
-F(__memcpy_arm)
+ F(__memcpy_arm, 0)
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static unsigned char dbuf[LEN+2*A];
-static unsigned char sbuf[LEN+2*A];
-static unsigned char wbuf[LEN+2*A];
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void test(const struct fun *fun, int dalign, int salign, int len)
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup(sbuf);
- unsigned char *dst = alignup(dbuf);
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = want + dalign;
- void *p;
- int i;
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
- if (len > LEN || dalign >= A || salign >= A)
- abort();
- for (i = 0; i < len+A; i++) {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i%23;
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
- p = fun->fun(d, s, len);
- if (p != d)
- ERR("%s(%p,..) returned %p\n", fun->name, d, p);
- for (i = 0; i < len+A; i++) {
- if (dst[i] != want[i]) {
- ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
- break;
- }
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
}
+ }
}
-int main()
+int
+main ()
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++)
- test(funtab+i, d, s, n);
- for (; n < LEN; n *= 2)
- test(funtab+i, d, s, n);
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ test (funtab + i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test (funtab + i, d, s, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 7891b14..689b68c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,7 +1,7 @@
/*
* memmove test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,136 +9,156 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun)(void *, const void *, size_t);
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(memmove)
+ // clang-format off
+ F(memmove, 0)
#if __aarch64__
-F(__memmove_aarch64)
+ F(__memmove_aarch64, 1)
# if __ARM_NEON
-F(__memmove_aarch64_simd)
+ F(__memmove_aarch64_simd, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static unsigned char dbuf[LEN+2*A];
-static unsigned char sbuf[LEN+2*A];
-static unsigned char wbuf[LEN+2*A];
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void test(const struct fun *fun, int dalign, int salign, int len)
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup(sbuf);
- unsigned char *dst = alignup(dbuf);
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = want + dalign;
- void *p;
- int i;
-
- if (len > LEN || dalign >= A || salign >= A)
- abort();
- for (i = 0; i < len+A; i++) {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i%23;
-
- p = fun->fun(d, s, len);
- if (p != d)
- ERR("%s(%p,..) returned %p\n", fun->name, d, p);
- for (i = 0; i < len+A; i++) {
- if (dst[i] != want[i]) {
- ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
- break;
- }
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ p = fun->fun (d, s, len);
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
}
+ }
}
-static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
+static void
+test_overlap (const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup(sbuf);
- unsigned char *dst = alignup(sbuf);
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = wbuf + dalign;
- void *p;
-
- if (len > LEN || dalign >= A || salign >= A)
- abort();
-
- for (int i = 0; i < len+A; i++)
- src[i] = want[i] = '?';
-
- for (int i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i%23;
-
- /* Copy the potential overlap range. */
- if (s < d) {
- for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
- want[salign+i] = src[salign+i];
- } else {
- for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
- want[len + dalign + i] = src[len + dalign + i];
- }
-
- p = fun->fun(d, s, len);
- if (p != d)
- ERR("%s(%p,..) returned %p\n", fun->name, d, p);
- for (int i = 0; i < len+A; i++) {
- if (dst[i] != want[i]) {
- ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
- abort();
- break;
- }
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = src;
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = want[salign + i] = 'a' + i % 23;
+ for (int i = 0; i < len; i++)
+ w[i] = s[i];
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
}
+ }
}
-int main()
+int
+main ()
{
- test_overlap(funtab+0, 2, 1, 1);
-
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n);
- test_overlap(funtab+i, d, s, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n);
- test_overlap(funtab+i, d, s, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
new file mode 100644
index 0000000..adf96f0
--- /dev/null
+++ b/string/test/memrchr.c
@@ -0,0 +1,106 @@
+/*
+ * memrchr test.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memrchr, 0)
+#if __aarch64__
+ F(__memrchr_aarch64, 1)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) && seekpos < maxlen ? seekpos - 1 : len] = seekchar;
+
+ s = tag_buffer (s, maxlen, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, maxlen, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memset.c b/string/test/memset.c
index 48c10fa..f172144 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -1,7 +1,7 @@
/*
* memset test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,103 +9,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun)(void *s, int c, size_t n);
+ const char *name;
+ void *(*fun) (void *s, int c, size_t n);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(memset)
+ // clang-format off
+ F(memset, 0)
#if __aarch64__
-F(__memset_aarch64)
+ F(__memset_aarch64, 1)
#elif __arm__
-F(__memset_arm)
+ F(__memset_arm, 0)
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static unsigned char sbuf[LEN+2*A];
+static unsigned char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void err(const char *name, unsigned char *src, int salign, int c, int len)
+static void
+test (const struct fun *fun, int salign, int c, int len)
{
- ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
- ERR("got : %.*s\n", salign+len+1, src);
-}
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
-static void test(const struct fun *fun, int salign, int c, int len)
-{
- unsigned char *src = alignup(sbuf);
- unsigned char *s = src + salign;
- void *p;
- int i;
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
- if (len > LEN || salign >= A)
- abort();
- for (i = 0; i < len+A; i++)
- src[i] = '?';
- for (i = 0; i < len; i++)
- s[i] = 'a' + i%23;
- for (; i<len%A; i++)
- s[i] = '*';
+ s = tag_buffer (s, len, fun->test_mte);
+ p = fun->fun (s, c, len);
+ untag_buffer (s, len, fun->test_mte);
- p = fun->fun(s, c, len);
- if (p != s)
- ERR("%s(%p,..) returned %p\n", fun->name, s, p);
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
- for (i = 0; i < salign; i++) {
- if (src[i] != '?') {
- err(fun->name, src, salign, c, len);
- return;
- }
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
}
- for (i = salign; i < len; i++) {
- if (src[i] != (unsigned char)c) {
- err(fun->name, src, salign, c, len);
- return;
- }
+ }
+ for (; i < salign + len; i++)
+ {
+ if (src[i] != (unsigned char) c)
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
}
- for (; i < len%A; i++) {
- if (src[i] != '*') {
- err(fun->name, src, salign, c, len);
- return;
- }
+ }
+ for (; i < len + A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
}
+ }
}
-int main()
+int
+main ()
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, s, 0, n);
- test(funtab+i, s, 0x25, n);
- test(funtab+i, s, 0xaa25, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, s, 0, n);
- test(funtab+i, s, 0x25, n);
- test(funtab+i, s, 0xaa25, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
}
- return r;
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/mte.h b/string/test/mte.h
new file mode 100644
index 0000000..e67cbd9
--- /dev/null
+++ b/string/test/mte.h
@@ -0,0 +1,142 @@
+/*
+ * Memory tagging testing code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __TEST_MTE_H
+#define __TEST_MTE_H
+
+#include <stdlib.h>
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <arm_acle.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+// These constants depend on a kernel ABI that has not yet been merged.
+#define PR_SET_TAGGED_ADDR_CTRL 55
+#define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+#define PR_MTE_TCF_SHIFT 1
+#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
+#define PR_MTE_TAG_SHIFT 3
+#define PROT_MTE 0x20
+
+#define MTE_GRANULE_SIZE 16
+
+int
+mte_enabled ()
+{
+ static int enabled = -1;
+ if (enabled == -1)
+ {
+ int res = prctl (PR_SET_TAGGED_ADDR_CTRL,
+ PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC
+ | (0xfffe << PR_MTE_TAG_SHIFT),
+ 0, 0, 0);
+ enabled = (res == 0);
+ }
+ return enabled;
+}
+
+static void *
+mte_mmap (size_t size)
+{
+ if (mte_enabled ())
+ {
+ return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ }
+ else
+ {
+ return malloc (size);
+ }
+}
+
+void *
+alignup_mte (void *p)
+{
+ return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1)
+ & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+aligndown_mte (void *p)
+{
+ return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+untag_pointer (void *p)
+{
+ return (void *) ((unsigned long long) p & (~0ULL >> 8));
+}
+
+void
+tag_buffer_helper (void *p, int len)
+{
+ char *ptr = p;
+ char *end = alignup_mte (ptr + len);
+ ptr = aligndown_mte (p);
+ for (; ptr < end; ptr += MTE_GRANULE_SIZE)
+ {
+ __arm_mte_set_tag (ptr);
+ }
+}
+
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ if (test_mte && mte_enabled ())
+ {
+ p = __arm_mte_increment_tag (p, 1);
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ p = untag_pointer (p);
+ if (test_mte && mte_enabled ())
+ {
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+#else // __ARM_FEATURE_MEMORY_TAGGING
+int
+mte_enabled ()
+{
+ return 0;
+}
+static void *
+mte_mmap (size_t size)
+{
+ return malloc (size);
+}
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_pointer (void *p)
+{
+ return p;
+}
+#endif // __ARM_FEATURE_MEMORY_TAGGING
+
+#endif
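
The mte.h helpers above are what every rewritten test relies on: allocate the buffers with mte_mmap, retag only the byte range the routine under test is allowed to touch, call the routine, then untag so the reference checks can read the buffer normally. A minimal sketch of that pattern, assuming only the helpers defined above (the buffer sizes and the use of memcpy are illustrative, not taken from the patch):

/* Sketch only, not part of the patch.  */
#include <string.h>
#include "mte.h"

int
example (void)
{
  /* mte_mmap returns a PROT_MTE mapping when MTE is enabled,
     plain malloc memory otherwise.  */
  unsigned char *d = mte_mmap (64);
  unsigned char *s = mte_mmap (64);
  memset (s, 'x', 32);

  /* Retag exactly the accessed range (test_mte = 1), run the routine,
     then restore the tags so later untagged accesses still work.  */
  s = tag_buffer (s, 32, 1);
  d = tag_buffer (d, 32, 1);
  memcpy (d, s, 32);
  s = untag_buffer (s, 32, 1);
  d = untag_buffer (d, 32, 1);
  return d[0] == 'x' ? 0 : 1;
}

When MTE is unavailable the same code still runs, because the fallback helpers degrade to malloc and no-op tagging.
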
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
new file mode 100644
index 0000000..1827e68
--- /dev/null
+++ b/string/test/stpcpy.c
@@ -0,0 +1,125 @@
+/*
+ * stpcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(stpcpy, 0)
+#if __aarch64__
+ F(__stpcpy_aarch64, 0)
+ F(__stpcpy_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__stpcpy_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d + len)
+ ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
+ }
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strchr.c b/string/test/strchr.c
index a625567..f3ae982 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -1,7 +1,7 @@
/*
* strchr test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,88 +10,112 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun)(const char *s, int c);
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strchr)
+ // clang-format off
+ F(strchr, 0)
#if __aarch64__
-F(__strchr_aarch64)
+ F(__strchr_aarch64, 0)
+ F(__strchr_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strchr_aarch64_sve)
+ F(__strchr_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup(sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : 0;
- int seekchar = 0x1;
- void *p;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
- abort();
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- if (seekpos != -1)
- s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
- p = fun->fun(s, seekchar);
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
- if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
- }
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
- test(funtab+i, a, sp, n);
- test(funtab+i, a, -1, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index 814dd1e..6c30ab2 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -1,99 +1,126 @@
/*
* strchrnul test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+#ifndef _GNU_SOURCE
#define _GNU_SOURCE
+#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun)(const char *s, int c);
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strchrnul)
+ // clang-format off
+ F(strchrnul, 0)
#if __aarch64__
-F(__strchrnul_aarch64)
+ F(__strchrnul_aarch64, 0)
+ F(__strchrnul_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strchrnul_aarch64_sve)
+ F(__strchrnul_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup(sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : s + len - 1;
- int seekchar = 0x1;
- void *p;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
- abort();
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- if (seekpos != -1)
- s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ int mte_len = seekpos != -1 ? seekpos + 1 : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
- p = fun->fun(s, seekchar);
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
- if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
- }
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
- test(funtab+i, a, sp, n);
- test(funtab+i, a, -1, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 91fa9dd..d57b54e 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,7 +1,7 @@
/*
* strcmp test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,95 +9,124 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun)(const char *s1, const char *s2);
+ const char *name;
+ int (*fun) (const char *s1, const char *s2);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strcmp)
+ // clang-format off
+ F(strcmp, 0)
#if __aarch64__
-F(__strcmp_aarch64)
+ F(__strcmp_aarch64, 0)
+ F(__strcmp_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strcmp_aarch64_sve)
+ F(__strcmp_aarch64_sve, 1)
# endif
#elif __arm__
# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
-F(__strcmp_arm)
+ F(__strcmp_arm, 0)
# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
-F(__strcmp_armv6m)
+ F(__strcmp_armv6m, 0)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char *s1buf;
+static char *s2buf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
{
- char *src1 = alignup(s1buf);
- char *src2 = alignup(s2buf);
- char *s1 = src1 + s1align;
- char *s2 = src2 + s2align;
- int r;
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
- if (len > LEN || s1align >= A || s2align >= A)
- abort();
- if (diffpos > 1 && diffpos >= len-1)
- abort();
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
- for (int i = 0; i < len+A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len-1; i++)
- s1[i] = s2[i] = 'a' + i%23;
- if (diffpos > 1)
- s1[diffpos]++;
- s1[len] = s2[len] = '\0';
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
- r = fun->fun(s1, s2);
+ s1 = tag_buffer (s1, len + 1, fun->test_mte);
+ s2 = tag_buffer (s2, len + 1, fun->test_mte);
+ r = fun->fun (s1, s2);
+ untag_buffer (s1, len + 1, fun->test_mte);
+ untag_buffer (s2, len + 1, fun->test_mte);
- if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
- ERR("%s(align %d, align %d, %d) failed, returned %d\n",
- fun->name, s1align, s2align, len, r);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
- }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
}
-int main()
+int
+main ()
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0);
- test(funtab+i, d, s, n, n / 2);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, 1);
+ test (funtab + i, d, s, 1, 0, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index 4882c9f..e84cace 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -1,7 +1,7 @@
/*
* strcpy test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,91 +9,115 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun)(char *dest, const char *src);
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strcpy)
+ // clang-format off
+ F(strcpy, 0)
#if __aarch64__
-F(__strcpy_aarch64)
+ F(__strcpy_aarch64, 0)
+ F(__strcpy_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strcpy_aarch64_sve)
+ F(__strcpy_aarch64_sve, 1)
# endif
#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
-F(__strcpy_arm)
+ F(__strcpy_arm, 0)
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define LEN 250000
-static char dbuf[LEN+2*A];
-static char sbuf[LEN+2*A];
-static char wbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int dalign, int salign, int len)
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
{
- char *src = alignup(sbuf);
- char *dst = alignup(dbuf);
- char *want = wbuf;
- char *s = src + salign;
- char *d = dst + dalign;
- char *w = want + dalign;
- void *p;
- int i;
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
- if (len > LEN || dalign >= A || salign >= A)
- abort();
- for (i = 0; i < len+A; i++) {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (i = 0; i < len-1; i++)
- s[i] = w[i] = 'a' + i%23;
- s[i] = w[i] = '\0';
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
- p = fun->fun(d, s);
- if (p != d)
- ERR("%s(%p,..) returned %p\n", fun->name, d, p);
- for (i = 0; i < len+A; i++) {
- if (dst[i] != want[i]) {
- ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
- ERR("got : %.*s\n", dalign+len+1, dst);
- ERR("want: %.*s\n", dalign+len+1, want);
- break;
- }
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s (%p,..) returned %p\n", fun->name, d, p);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
}
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++)
- test(funtab+i, d, s, n);
- for (; n < LEN; n *= 2)
- test(funtab+i, d, s, n);
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
new file mode 100644
index 0000000..fe855fc
--- /dev/null
+++ b/string/test/stringtest.h
@@ -0,0 +1,55 @@
+/*
+ * Common string test code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+/* Accounting errors for a test case. */
+static int err_count;
+#define ERR_LIMIT 10
+#define ERR(...) (err_count++, printf (__VA_ARGS__))
+
+static inline void
+quotechar (unsigned char c)
+{
+ if (isprint (c))
+ putchar (c);
+ else
+ printf ("\\x%02x", c);
+}
+
+/* Quoted print of the bytes around position 'at', or of the entire string if at < 0. */
+static void
+quoteat (const char *prefix, const void *p, int len, int at)
+{
+ static const int CTXLEN = 15;
+ int i;
+ const char *pre = "\"";
+ const char *post = "\"";
+ const char *s = p;
+ if (at > CTXLEN)
+ {
+ s += at - CTXLEN;
+ len -= at - CTXLEN;
+ pre = "...\"";
+ }
+ if (at >= 0 && len > 2 * CTXLEN + 1)
+ {
+ len = 2 * CTXLEN + 1;
+ post = "\"...";
+ }
+ printf ("%4s: %s", prefix, pre);
+ for (i = 0; i < len; i++)
+ quotechar (s[i]);
+ printf ("%s\n", post);
+}
+
+static inline void
+quote (const char *prefix, const void *p, int len)
+{
+ quoteat (prefix, p, len, -1);
+}
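
stringtest.h centralizes the reporting that each test file previously duplicated (a local test_status flag and ERR macro): err_count counts failures, ERR_LIMIT lets test functions bail out early, and quote/quoteat print an escaped window of the buffer around the failing offset. A hedged sketch of how a test body would use these helpers (the function name report_mismatch is made up for illustration):

/* Sketch only, not part of the patch.  */
#include "stringtest.h"

static void
report_mismatch (const char *name, const unsigned char *got,
		 const unsigned char *want, int total, int at)
{
  /* Callers return early once the per-function error limit is hit.  */
  if (err_count >= ERR_LIMIT)
    return;
  ERR ("%s failed at offset %d\n", name, at);
  quoteat ("got", got, total, at);   /* prints ..."context"... around at */
  quoteat ("want", want, total, at);
}
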
diff --git a/string/test/strlen.c b/string/test/strlen.c
index ff8e328..6278380 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -1,7 +1,7 @@
/*
* strlen test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,82 +9,95 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/mman.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- size_t (*fun)(const char *s);
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strlen)
+ // clang-format off
+ F(strlen, 0)
#if __aarch64__
-F(__strlen_aarch64)
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strlen_aarch64_sve)
+ F(__strlen_aarch64_sve, 1)
# endif
#elif __arm__
# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
-F(__strlen_armv6t2)
+ F(__strlen_armv6t2, 0)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int len)
+static void
+test (const struct fun *fun, int align, int len)
{
- char *src = alignup(sbuf);
- char *s = src + align;
- size_t r;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
- if (len > LEN || align >= A)
- abort();
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = '\0';
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- s[len - 1] = '\0';
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ r = fun->fun (s);
+ untag_buffer (s, len + 1, fun->test_mte);
- r = fun->fun(s);
- if (r != len-1) {
- ERR("%s(%p) returned %zu\n", fun->name, s, r);
- ERR("input: %.*s\n", align+len+1, src);
- ERR("expected: %d\n", len);
- abort();
- }
+ if (r != len)
+ {
+ ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len);
+ quote ("input", src, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- int n;
- for (n = 1; n < 100; n++)
- test(funtab+i, a, n);
- for (; n < LEN; n *= 2)
- test(funtab+i, a, n);
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, a, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 43f941d..018a8a4 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,7 +1,7 @@
/*
* strncmp test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,95 +9,131 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun)(const char *, const char *, size_t);
+ const char *name;
+ int (*fun) (const char *, const char *, size_t);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strncmp)
+ // clang-format off
+ F(strncmp, 0)
#if __aarch64__
-F(__strncmp_aarch64)
+ F(__strncmp_aarch64, 0)
+ F(__strncmp_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strncmp_aarch64_sve)
+ F(__strncmp_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
-
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
+#undef F
#define A 32
#define LEN 250000
-static char s1buf[LEN+2*A];
-static char s2buf[LEN+2*A];
+static char *s1buf;
+static char *s2buf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + A - 1) & -A);
}
-static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
+static void
+test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos,
+ int len, int delta)
{
- char *src1 = alignup(s1buf);
- char *src2 = alignup(s2buf);
- char *s1 = src1 + s1align;
- char *s2 = src2 + s2align;
- int r;
-
- if (len > LEN || s1align >= A || s2align >= A)
- abort();
- if (diffpos > 1 && diffpos >= len-1)
- abort();
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
- for (int i = 0; i < len+A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len-1; i++)
- s1[i] = s2[i] = 'a' + i%23;
- if (diffpos > 1)
- s1[diffpos]++;
- s1[len] = s2[len] = '\0';
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
- r = fun->fun(s1, s2, maxlen);
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
- diffpos = maxlen <= diffpos ? 0 : diffpos;
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s1 = tag_buffer (s1, mte_len, fun->test_mte);
+ s2 = tag_buffer (s2, mte_len, fun->test_mte);
+ r = fun->fun (s1, s2, maxlen);
+ untag_buffer (s1, mte_len, fun->test_mte);
+ untag_buffer (s2, mte_len, fun->test_mte);
- if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
- ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
- fun->name, s1align, s2align, maxlen, len, r, diffpos);
- ERR("src1: %.*s\n", s1align+len+1, src1);
- ERR("src2: %.*s\n", s2align+len+1, src2);
- }
+ if (diffpos >= maxlen)
+ {
+ diffpos = -1;
+ delta = 0;
+ }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR (
+ "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
+ fun->name, s1align, s2align, maxlen, len, diffpos, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
}
-int main()
+int
+main ()
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++) {
- int n;
- for (n = 0; n < 100; n++) {
- test(funtab+i, d, s, n, 0, n);
- test(funtab+i, d, s, n, n/2, n);
- test(funtab+i, d, s, n/2, 0, n);
- test(funtab+i, d, s, n/2, n/2, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, d, s, n, 0, n);
- test(funtab+i, d, s, n, n/2, n);
- test(funtab+i, d, s, n/2, 0, n);
- test(funtab+i, d, s, n/2, n/2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0, 0);
+ test (funtab + i, d, s, 1, -1, 0, 0);
+ test (funtab + i, d, s, 0, -1, 1, 0);
+ test (funtab + i, d, s, 1, -1, 1, 0);
+ test (funtab + i, d, s, 2, -1, 1, 0);
+ test (funtab + i, d, s, 1, 0, 1, 1);
+ test (funtab + i, d, s, 1, 0, 1, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, 1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, -1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, -1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, 1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index db41f2a..0dea00e 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -1,93 +1,109 @@
/*
* strnlen test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#define _POSIX_C_SOURCE 200809L
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- size_t (*fun)(const char *s, size_t m);
+ const char *name;
+ size_t (*fun) (const char *s, size_t m);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strnlen)
+ // clang-format off
+ F(strnlen, 0)
#if __aarch64__
-F(__strnlen_aarch64)
+ F(__strnlen_aarch64, 1)
# if __ARM_FEATURE_SVE
-F(__strnlen_aarch64_sve)
+ F(__strnlen_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int maxlen, int len)
+static void
+test (const struct fun *fun, int align, size_t maxlen, size_t len)
{
- char *src = alignup(sbuf);
- char *s = src + align;
- size_t r;
- size_t e = maxlen < len ? maxlen : len - 1;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
- if (len > LEN || align >= A)
- abort();
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = 0;
+ if ((len + align) & 1)
+ s[e + 1] = 0;
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- s[len - 1] = '\0';
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ r = fun->fun (s, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
- r = fun->fun(s, maxlen);
- if (r != e) {
- ERR("%s(%p) returned %zu\n", fun->name, s, r);
- ERR("input: %.*s\n", align+len+1, src);
- ERR("expected: %d\n", len);
- abort();
- }
+ if (r != e)
+ {
+ ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n",
+ fun->name, s, maxlen, len, r, e);
+ quote ("input", s, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- int n;
- for (n = 1; n < 100; n++)
- for (int maxlen = 0; maxlen < 100; maxlen++)
- test(funtab+i, a, maxlen, n);
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, n*2, n);
- test(funtab+i, a, n, n);
- test(funtab+i, a, n/2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int maxlen = 0; maxlen < LEN; maxlen++)
+ test (funtab + i, a, maxlen, n);
+ test (funtab + i, a, SIZE_MAX - a, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index 532fa51..fedbdc5 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -1,7 +1,7 @@
/*
* strrchr test.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,88 +10,112 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include "mte.h"
#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun)(const char *s, int c);
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
} funtab[] = {
-#define F(x) {#x, x},
-F(strrchr)
+ // clang-format off
+ F(strrchr, 0)
#if __aarch64__
-F(__strrchr_aarch64)
+ F(__strrchr_aarch64, 0)
+ F(__strrchr_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
-F(__strrchr_aarch64_sve)
+ F(__strrchr_aarch64_sve, 1)
# endif
#endif
-#undef F
- {0, 0}
+ {0, 0, 0}
+ // clang-format on
};
+#undef F
-static int test_status;
-#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-
-#define A 32
-#define SP 512
-#define LEN 250000
-static char sbuf[LEN+2*A];
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
-static void *alignup(void *p)
+static void *
+alignup (void *p)
{
- return (void*)(((uintptr_t)p + A-1) & -A);
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
}
-static void test(const struct fun *fun, int align, int seekpos, int len)
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup(sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : 0;
- int seekchar = 0x1;
- void *p;
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
- if (len > LEN || seekpos >= len - 1 || align >= A)
- abort();
- if (seekchar >= 'a' && seekchar <= 'a' + 23)
- abort();
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
- for (int i = 0; i < len + A; i++)
- src[i] = '?';
- for (int i = 0; i < len - 2; i++)
- s[i] = 'a' + i%23;
- if (seekpos != -1)
- s[seekpos/2] = s[seekpos] = seekchar;
- s[len - 1] = '\0';
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos / 2] = s[seekpos] = seekchar;
+ if (seekpos > 0 && (len + align) & 1)
+ s[seekpos - 1] = seekchar;
+ s[len] = '\0';
- p = fun->fun(s, seekchar);
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
- if (p != f) {
- ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
- ERR("expected: %p\n", f);
- abort();
- }
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, s + len, len);
+ quote ("input", s, len);
+ }
}
-int main()
+int
+main (void)
{
- int r = 0;
- for (int i=0; funtab[i].name; i++) {
- test_status = 0;
- for (int a = 0; a < A; a++) {
- int n;
- for (n = 1; n < 100; n++) {
- for (int sp = 0; sp < n - 1; sp++)
- test(funtab+i, a, sp, n);
- test(funtab+i, a, -1, n);
- }
- for (; n < LEN; n *= 2) {
- test(funtab+i, a, -1, n);
- test(funtab+i, a, n / 2, n);
- }
- }
- printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
- if (test_status)
- r = -1;
- }
- return r;
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
new file mode 100644
index 0000000..26ade0a
--- /dev/null
+++ b/string/x86_64/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__x86_64__
+# error ARCH setting does not match the compiler.
+#endif