author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-07-15 01:25:06 +0000
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-07-15 01:25:06 +0000
commit | 8ad45898efde06ae7004485cefc495506b67b1fb (patch)
tree | 7ded61f3cfe322fe4706181e50b2f28b35d8a4a0
parent | 8edcec53c6d84dc7f85e4c0a8539384b3fe489ec (diff)
parent | 0cbe0156ef389ae56254a55c909c3da03b72616c (diff)
download | arm-optimized-routines-android12-mainline-conscrypt-release.tar.gz

Snap for 7550844 from 0cbe0156ef389ae56254a55c909c3da03b72616c to mainline-conscrypt-release
Tags: android-mainline-12.0.0_r8, android-mainline-12.0.0_r25, android12-mainline-conscrypt-release

Change-Id: I66c7a07f58dd84f1326fd245a279625e5df63c10

156 files changed, 6530 insertions, 2033 deletions
@@ -1,3 +1,20 @@
+package {
+    default_applicable_licenses: ["external_arm-optimized-routines_license"],
+}
+
+// Added automatically by a large-scale-change
+// See: http://go/android-license-faq
+license {
+    name: "external_arm-optimized-routines_license",
+    visibility: [":__subpackages__"],
+    license_kinds: [
+        "SPDX-license-identifier-MIT",
+    ],
+    license_text: [
+        "LICENSE",
+    ],
+}
+
 cc_defaults {
     name: "arm-optimized-routines-defaults",
     host_supported: true,
@@ -26,12 +43,37 @@ cc_defaults {
     local_include_dirs: ["math/include"],
 }
 
-cc_library {
-    name: "libarm-optimized-routines-math",
+cc_defaults {
+    name: "libarm-optimized-routines-defaults",
     defaults: ["arm-optimized-routines-defaults"],
     ramdisk_available: true,
+    vendor_ramdisk_available: true,
     recovery_available: true,
     native_bridge_supported: true,
+    apex_available: [
+        "//apex_available:platform",
+        "com.android.runtime",
+    ],
+
+    stl: "none",
+    static: {
+        system_shared_libs: [],
+    },
+    header_libs: ["libc_headers"],
+}
+
+cc_library_static {
+    name: "libarm-optimized-routines-math",
+    defaults: ["libarm-optimized-routines-defaults"],
+    exclude_srcs: [
+        // Provided by:
+        //   bionic/libm/upstream-freebsd/lib/msun/src/s_erf.c
+        //   bionic/libm/upstream-freebsd/lib/msun/src/s_erff.c
+        "math/erf.c",
+        "math/erf_data.c",
+        "math/erff.c",
+        "math/erff_data.c",
+    ],
     srcs: [
         "math/*.c",
     ],
@@ -53,9 +95,43 @@ cc_library {
             enabled: true,
         },
     },
-    stl: "none",
-    static: {
-        system_shared_libs: [],
+}
+
+cc_library_static {
+    name: "libarm-optimized-routines-string",
+    defaults: ["libarm-optimized-routines-defaults"],
+
+    arch: {
+        arm64: {
+            srcs: [
+                "string/aarch64/memchr-mte.S",
+                "string/aarch64/memchr.S",
+                "string/aarch64/memcmp.S",
+                "string/aarch64/memrchr.S",
+                "string/aarch64/stpcpy-mte.S",
+                "string/aarch64/stpcpy.S",
+                "string/aarch64/strchrnul-mte.S",
+                "string/aarch64/strchrnul.S",
+                "string/aarch64/strchr-mte.S",
+                "string/aarch64/strchr.S",
+                "string/aarch64/strcmp-mte.S",
+                "string/aarch64/strcmp.S",
+                "string/aarch64/strcpy-mte.S",
+                "string/aarch64/strcpy.S",
+                "string/aarch64/strlen-mte.S",
+                "string/aarch64/strlen.S",
+                "string/aarch64/strncmp-mte.S",
+                "string/aarch64/strncmp.S",
+                "string/aarch64/strnlen.S",
+                "string/aarch64/strrchr-mte.S",
+                "string/aarch64/strrchr.S",
+            ],
+            asflags: [
+                "-D__memcmp_aarch64=memcmp",
+                "-D__memrchr_aarch64=memrchr",
+                "-D__strnlen_aarch64=strnlen",
+            ]
+        },
     },
 }
 
@@ -93,7 +169,7 @@ sh_test {
     test_suites: ["general-tests"],
     host_supported: true,
     device_supported: false,
-    test_config: "arm-optimized-routines-tests.xml",
+    require_root: true,
     target_required: [
         "mathtest",
         "ulp",
@@ -9,11 +9,11 @@ third_party {
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "33ba19089a261964e1e84ba4edf90263b468c161"
+  version: "v21.02"
  license_type: NOTICE
  last_upgrade_date {
-    year: 2020
+    year: 2021
    month: 2
-    day: 1
+    day: 18
  }
 }
@@ -1,6 +1,6 @@
 # Makefile - requires GNU make
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 srcdir = .
@@ -10,7 +10,7 @@ libdir = $(prefix)/lib
 includedir = $(prefix)/include
 
 # Configure these in config.mk, do not make changes in this file.
-SUBS = math string
+SUBS = math string networking
 HOST_CC = cc
 HOST_CFLAGS = -std=c99 -O2
 HOST_LDFLAGS =
@@ -1 +0,0 @@
-LICENSE
\ No newline at end of file
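The Android.bp restructuring above is what lets platform code consume these routines as freestanding static libraries: a plausible consumer (a sketch, not part of this change) would list `libarm-optimized-routines-string` in its `static_libs` or `whole_static_libs`, and the `asflags` renames (`-D__memcmp_aarch64=memcmp`, etc.) are what give the assembly routines their standard C names at link time. The `apex_available` entries are what permit `com.android.runtime` to depend on them.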
@@ -8,7 +8,8 @@ Assignment Agreement, please follow the instructions in
 contributor-agreement.pdf. This is needed so upstreaming code
 to projects that require copyright assignment is possible.
 
-Regular quarterly releases are tagged as vYY.MM (e.g. v19.11).
+Regular quarterly releases are tagged as vYY.MM, the latest
+release is v20.11.
 
 Source code layout:
 
@@ -17,6 +18,9 @@ math/ - math subproject sources.
 math/include/ - math library public headers.
 math/test/ - math test and benchmark related sources.
 math/tools/ - tools used for designing the algorithms.
+networking/ - networking subproject sources.
+networking/include/ - networking library public headers.
+networking/test/ - networking test and benchmark related sources.
 string/ - string routines subproject sources.
 string/include/ - string library public headers.
 string/test/ - string test and benchmark related sources.
diff --git a/TEST_MAPPING b/TEST_MAPPING
index e4d3d5e..66bdc01 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -2,6 +2,9 @@
   "presubmit": [
     {
       "name": "CtsBionicTestCases"
+    },
+    {
+      "name": "arm-optimized-routines-tests"
     }
   ]
 }
diff --git a/arm-optimized-routines-tests.xml b/arm-optimized-routines-tests.xml
deleted file mode 100644
index 96db90c..0000000
--- a/arm-optimized-routines-tests.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!-- Copyright (C) 2019 The Android Open Source Project
-
-     Licensed under the Apache License, Version 2.0 (the "License");
-     you may not use this file except in compliance with the License.
-     You may obtain a copy of the License at
-
-          http://www.apache.org/licenses/LICENSE-2.0
-
-     Unless required by applicable law or agreed to in writing, software
-     distributed under the License is distributed on an "AS IS" BASIS,
-     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     See the License for the specific language governing permissions and
-     limitations under the License.
--->
-<configuration description="Config for running arm-optimized-routines-tests through Atest or in Infra">
-    <option name="test-suite-tag" value="arm-optimized-routines-tests" />
-    <!-- This test requires a device, so it's not annotated with a null-device. -->
-    <test class="com.android.tradefed.testtype.binary.ExecutableHostTest" >
-        <option name="binary" value="run-arm-optimized-routines-tests-on-android.sh" />
-        <!-- Test script assumes a relative path with the tests/ folders. -->
-        <option name="relative-path-execution" value="true" />
-        <!-- Tests shouldn't be that long but set 15m to be safe. -->
-        <option name="per-binary-timeout" value="15m" />
-    </test>
-</configuration>
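With the hand-written Tradefed XML gone, the sh_test module falls back to its generated config (hence `require_root: true` moving into Android.bp above), and the new TEST_MAPPING entry pulls the suite into presubmit; locally, `atest arm-optimized-routines-tests` would be the expected invocation, an assumption based on the module name rather than anything stated in this change.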
diff --git a/config.mk.dist b/config.mk.dist
index 301b5f9..177e1ac 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,20 +1,28 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 # Subprojects to build
-SUBS = math string
+SUBS = math string networking
 
-HOST_CC = gcc
-HOST_CFLAGS = -std=c99 -O2
-HOST_CFLAGS += -Wall -Wno-unused-function
+# Target architecture: aarch64, arm or x86_64
+ARCH = aarch64
+
+# Use for cross compilation with gcc.
+#CROSS_COMPILE = aarch64-none-linux-gnu-
+
+# Compiler for the target
 CC = $(CROSS_COMPILE)gcc
 CFLAGS = -std=c99 -pipe -O3
 CFLAGS += -Wall -Wno-missing-braces
 CFLAGS += -Werror=implicit-function-declaration
+
+# Used for test case generator that is executed on the host
+HOST_CC = gcc
+HOST_CFLAGS = -std=c99 -O2
+HOST_CFLAGS += -Wall -Wno-unused-function
 
 # Enable debug info.
 HOST_CFLAGS += -g
 CFLAGS += -g
@@ -22,8 +30,8 @@ CFLAGS += -g
 # Optimize the shared libraries on aarch64 assuming they fit in 1M.
 #CFLAGS_SHARED = -fPIC -mcmodel=tiny
 
-# Use for cross compilation with gcc.
-#CROSS_COMPILE = aarch64-none-linux-gnu-
+# Enable MTE support.
+#CFLAGS += -march=armv8.5-a+memtag -DWANT_MTE_TEST=1
 
 # Use with cross testing.
 #EMULATOR = qemu-aarch64-static
@@ -35,6 +43,7 @@ math-ldlibs =
 math-ulpflags =
 math-testflags =
 string-cflags =
+networking-cflags =
 
 # Use if mpfr is available on the target for ulp error checking.
 #math-ldlibs += -lmpfr -lgmp
@@ -53,3 +62,12 @@ math-cflags += -ffp-contract=fast -fno-math-errno
 # Disable fenv checks
 #math-ulpflags = -q -f
 #math-testflags = -nostatus
+
+# Remove GNU Property Notes from asm files.
+#string-cflags += -DWANT_GNU_PROPERTY=0
+
+# Enable assertion checks.
+#networking-cflags += -DWANT_ASSERT
+
+# Avoid auto-vectorization of scalar code and unroll loops
+networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
diff --git a/math/cosf.c b/math/cosf.c
index 831b39e..f29f194 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/erf.c b/math/erf.c
new file mode 100644
index 0000000..12d7e51
--- /dev/null
+++ b/math/erf.c
@@ -0,0 +1,244 @@
+/*
+ * Double-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+#include <math.h>
+#include <stdint.h>
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
+#define C 0x1.b0ac16p-1
+#define PA __erf_data.erf_poly_A
+#define NA __erf_data.erf_ratio_N_A
+#define DA __erf_data.erf_ratio_D_A
+#define NB __erf_data.erf_ratio_N_B
+#define DB __erf_data.erf_ratio_D_B
+#define PC __erf_data.erfc_poly_C
+#define PD __erf_data.erfc_poly_D
+#define PE __erf_data.erfc_poly_E
+#define PF __erf_data.erfc_poly_F
+
+/* Top 32 bits of a double. */
+static inline uint32_t
+top32 (double x)
+{
+  return asuint64 (x) >> 32;
+}
+
+/* Fast erf implementation using a mix of
+   rational and polynomial approximations.
+   Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */
+double
+erf (double x)
+{
+  /* Get top word and sign. */
+  uint32_t ix = top32 (x);
+  uint32_t ia = ix & 0x7fffffff;
+  uint32_t sign = ix >> 31;
+
+  /* Normalized and subnormal cases */
+  if (ia < 0x3feb0000)
+    { /* a = |x| < 0.84375. */
+
+      if (ia < 0x3e300000)
+	{ /* a < 2^(-28). */
+	  if (ia < 0x00800000)
+	    { /* a < 2^(-1015). */
+	      double y = fma (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflow (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x;
+	}
+
+      double x2 = x * x;
+
+      if (ia < 0x3fe00000)
+	{ /* a < 0.5 - Use polynomial approximation. */
+	  double r1 = fma (x2, PA[1], PA[0]);
+	  double r2 = fma (x2, PA[3], PA[2]);
+	  double r3 = fma (x2, PA[5], PA[4]);
+	  double r4 = fma (x2, PA[7], PA[6]);
+	  double r5 = fma (x2, PA[9], PA[8]);
+	  double x4 = x2 * x2;
+	  double r = r5;
+	  r = fma (x4, r, r4);
+	  r = fma (x4, r, r3);
+	  r = fma (x4, r, r2);
+	  r = fma (x4, r, r1);
+	  return fma (r, x, x); /* This fma is crucial for accuracy. */
+	}
+      else
+	{ /* 0.5 <= a < 0.84375 - Use rational approximation. */
+	  double x4, x8, r1n, r2n, r1d, r2d, r3d;
+
+	  r1n = fma (x2, NA[1], NA[0]);
+	  x4 = x2 * x2;
+	  r2n = fma (x2, NA[3], NA[2]);
+	  x8 = x4 * x4;
+	  r1d = fma (x2, DA[0], 1.0);
+	  r2d = fma (x2, DA[2], DA[1]);
+	  r3d = fma (x2, DA[4], DA[3]);
+	  double P = r1n + x4 * r2n + x8 * NA[4];
+	  double Q = r1d + x4 * r2d + x8 * r3d;
+	  return fma (P / Q, x, x);
+	}
+    }
+  else if (ia < 0x3ff40000)
+    { /* 0.84375 <= |x| < 1.25. */
+      double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
+      double a = fabs (x) - 1.0;
+      r1n = fma (a, NB[1], NB[0]);
+      a2 = a * a;
+      r1d = fma (a, DB[0], 1.0);
+      a4 = a2 * a2;
+      r2n = fma (a, NB[3], NB[2]);
+      a6 = a4 * a2;
+      r2d = fma (a, DB[2], DB[1]);
+      r3n = fma (a, NB[5], NB[4]);
+      r3d = fma (a, DB[4], DB[3]);
+      r4n = NB[6];
+      r4d = DB[5];
+      double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
+      double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
+      if (sign)
+	return -C - P / Q;
+      else
+	return C + P / Q;
+    }
+  else if (ia < 0x40000000)
+    { /* 1.25 <= |x| < 2.0. */
+      double a = fabs (x);
+      a = a - 1.25;
+
+      double r1 = fma (a, PC[1], PC[0]);
+      double r2 = fma (a, PC[3], PC[2]);
+      double r3 = fma (a, PC[5], PC[4]);
+      double r4 = fma (a, PC[7], PC[6]);
+      double r5 = fma (a, PC[9], PC[8]);
+      double r6 = fma (a, PC[11], PC[10]);
+      double r7 = fma (a, PC[13], PC[12]);
+      double r8 = fma (a, PC[15], PC[14]);
+
+      double a2 = a * a;
+
+      double r = r8;
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x400a0000)
+    { /* 2 <= |x| < 3.25. */
+      double a = fabs (x);
+      a = fma (0.5, a, -1.0);
+
+      double r1 = fma (a, PD[1], PD[0]);
+      double r2 = fma (a, PD[3], PD[2]);
+      double r3 = fma (a, PD[5], PD[4]);
+      double r4 = fma (a, PD[7], PD[6]);
+      double r5 = fma (a, PD[9], PD[8]);
+      double r6 = fma (a, PD[11], PD[10]);
+      double r7 = fma (a, PD[13], PD[12]);
+      double r8 = fma (a, PD[15], PD[14]);
+      double r9 = fma (a, PD[17], PD[16]);
+
+      double a2 = a * a;
+
+      double r = r9;
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x40100000)
+    { /* 3.25 <= |x| < 4.0. */
+      double a = fabs (x);
+      a = a - 3.25;
+
+      double r1 = fma (a, PE[1], PE[0]);
+      double r2 = fma (a, PE[3], PE[2]);
+      double r3 = fma (a, PE[5], PE[4]);
+      double r4 = fma (a, PE[7], PE[6]);
+      double r5 = fma (a, PE[9], PE[8]);
+      double r6 = fma (a, PE[11], PE[10]);
+      double r7 = fma (a, PE[13], PE[12]);
+
+      double a2 = a * a;
+
+      double r = r7;
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x4017a000)
+    { /* 4 <= |x| < 5.90625. */
+      double a = fabs (x);
+      a = fma (0.5, a, -2.0);
+
+      double r1 = fma (a, PF[1], PF[0]);
+      double r2 = fma (a, PF[3], PF[2]);
+      double r3 = fma (a, PF[5], PF[4]);
+      double r4 = fma (a, PF[7], PF[6]);
+      double r5 = fma (a, PF[9], PF[8]);
+      double r6 = fma (a, PF[11], PF[10]);
+      double r7 = fma (a, PF[13], PF[12]);
+      double r8 = fma (a, PF[15], PF[14]);
+      double r9 = PF[16];
+
+      double a2 = a * a;
+
+      double r = r9;
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else
+    {
+      /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
+      if (unlikely (ia >= 0x7ff00000))
+	return (double) (1.0 - (sign << 1)) + 1.0 / x;
+
+      if (sign)
+	return -1.0;
+      else
+	return 1.0;
+    }
+}
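erf's range reduction never compares doubles directly; it branches on the top 32 bits of the representation, so 0x3feb0000 above really is 0.84375. A minimal standalone check (the asuint64 helper here is a local stand-in mirroring math_config.h, not the library's own):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Local stand-in for math_config.h's asuint64.  */
static uint64_t
asuint64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

int
main (void)
{
  /* Top word of 0.84375 is 0x3feb0000, the first branch threshold in erf.  */
  printf ("%08x\n", (uint32_t) (asuint64 (0.84375) >> 32));
  /* 0.5 -> 0x3fe00000 and 2.0 -> 0x40000000 match the other thresholds.  */
  printf ("%08x %08x\n", (uint32_t) (asuint64 (0.5) >> 32),
	  (uint32_t) (asuint64 (2.0) >> 32));
  return 0;
}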
diff --git a/math/erf_data.c b/math/erf_data.c
new file mode 100644
index 0000000..807875b
--- /dev/null
+++ b/math/erf_data.c
@@ -0,0 +1,85 @@
+/*
+ * Shared data between erf and erfc.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/*
+Minimax approximation of erf
+*/
+const struct erf_data __erf_data = {
+.erf_poly_A = {
+#if ERF_POLY_A_NCOEFFS == 10
+0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
+-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
+0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
+-0x1.18c47fd143c5ep-23
+#endif
+},
+/* Rational approximation on [0x1p-28, 0.84375] */
+.erf_ratio_N_A = {
+0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
+-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
+},
+.erf_ratio_D_A = {
+0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
+0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
+},
+/* Rational approximation on [0.84375, 1.25] */
+.erf_ratio_N_B = {
+-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
+0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
+-0x1.1bf380a96073fp-9
+},
+.erf_ratio_D_B = {
+0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
+0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
+},
+.erfc_poly_C = {
+#if ERFC_POLY_C_NCOEFFS == 16
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
+0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
+-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
+-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
+-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
+0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
+-0x1.578c9e375d37p-19
+#endif
+},
+.erfc_poly_D = {
+#if ERFC_POLY_D_NCOEFFS == 18
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
+0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
+-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
+-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
+0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
+-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
+0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
+#endif
+},
+.erfc_poly_E = {
+#if ERFC_POLY_E_NCOEFFS == 14
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
+0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
+-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
+0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
+-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
+-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
+#endif
+},
+.erfc_poly_F = {
+#if ERFC_POLY_F_NCOEFFS == 17
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
+0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
+-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
+0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
+-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
+0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
+-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
+#endif
+}
+};
+
diff --git a/math/erff.c b/math/erff.c
new file mode 100644
index 0000000..a58e825
--- /dev/null
+++ b/math/erff.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "math_config.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define A __erff_data.erff_poly_A
+#define B __erff_data.erff_poly_B
+
+/* Top 12 bits of a float. */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+/* Efficient implementation of erff
+   using either a pure polynomial approximation or
+   the exponential of a polynomial.
+   Worst-case error is 1.09ulps at 0x1.c111acp-1. */
+float
+erff (float x)
+{
+  float r, x2, u;
+
+  /* Get top word. */
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* Limit of both intervals is 0.875 for performance reasons but coefficients
+     computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
+     from 0.94 to 1.1ulps. */
+  if (ia12 < 0x3f6)
+    { /* a = |x| < 0.875. */
+
+      /* Tiny and subnormal cases. */
+      if (unlikely (ia12 < 0x318))
+	{ /* |x| < 2^(-28). */
+	  if (unlikely (ia12 < 0x040))
+	    { /* |x| < 2^(-119). */
+	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflowf (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x;
+	}
+
+      x2 = x * x;
+
+      /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
+      r = A[5];
+      r = fmaf (r, x2, A[4]);
+      r = fmaf (r, x2, A[3]);
+      r = fmaf (r, x2, A[2]);
+      r = fmaf (r, x2, A[1]);
+      r = fmaf (r, x2, A[0]);
+      r = fmaf (r, x, x);
+    }
+  else if (ia12 < 0x408)
+    { /* |x| < 4.0 - Use a custom Estrin scheme. */
+
+      float a = fabsf (x);
+      /* Start with Estrin scheme on high order (small magnitude) coefficients. */
+      r = fmaf (B[6], a, B[5]);
+      u = fmaf (B[4], a, B[3]);
+      x2 = x * x;
+      r = fmaf (r, x2, u);
+      /* Then switch to pure Horner scheme. */
+      r = fmaf (r, a, B[2]);
+      r = fmaf (r, a, B[1]);
+      r = fmaf (r, a, B[0]);
+      r = fmaf (r, a, a);
+      /* Single precision exponential with ~0.5ulps,
+	 ensures erff has max. rel. error
+	 < 1ulp on [0.921875, 4.0],
+	 < 1.1ulps on [0.875, 4.0]. */
+      r = expf (-r);
+      /* Explicit copysign (calling copysignf increases latency). */
+      if (sign)
+	r = -1.0f + r;
+      else
+	r = 1.0f - r;
+    }
+  else
+    { /* |x| >= 4.0. */
+
+      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
+      if (unlikely (ia12 >= 0x7f8))
+	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+      /* Explicit copysign (calling copysignf increases latency). */
+      if (sign)
+	r = -1.0f;
+      else
+	r = 1.0f;
+    }
+  return r;
+}
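erff's middle branch mixes evaluation schemes: Estrin pairs independent fma chains to shorten the dependency chain, then Horner finishes. A minimal sketch of the difference on a generic degree-3 polynomial (hypothetical coefficients c[0..3], not the erff data):

#include <math.h>

/* Horner: three sequential fmas, each depending on the previous result.  */
static double
horner3 (double x, const double c[4])
{
  return fma (x, fma (x, fma (x, c[3], c[2]), c[1]), c[0]);
}

/* Estrin: the two inner fmas are independent and can execute in
   parallel; only the final combination depends on both.  */
static double
estrin3 (double x, const double c[4])
{
  double lo = fma (x, c[1], c[0]);   /* c0 + c1*x */
  double hi = fma (x, c[3], c[2]);   /* c2 + c3*x */
  return fma (x * x, hi, lo);        /* lo + x^2 * hi */
}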
diff --git a/math/erff_data.c b/math/erff_data.c
new file mode 100644
index 0000000..fa6b1ef
--- /dev/null
+++ b/math/erff_data.c
@@ -0,0 +1,22 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff. */
+const struct erff_data __erff_data = {
+.erff_poly_A = {
+0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
+},
+.erff_poly_B = {
+0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
+-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
+0x1.222900p-16f
+}
+};
+
@@ -1,7 +1,7 @@
 /*
  * Double-precision e^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/exp2.c b/math/exp2.c
index 47aa479..35ab39f 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision 2^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/expf.c b/math/expf.c
index 0fe1f7d..9b2f0c3 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision e^x function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 4493008..279d829 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
 /*
  * Public API.
  *
- * Copyright (c) 2015-2019, Arm Limited.
+ * Copyright (c) 2015-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
@@ -1,7 +1,7 @@
 /*
  * Double-precision log(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/log2.c b/math/log2.c
index 804fb85..55102b7 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision log2(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/logf.c b/math/logf.c
index ee3120a..cfbaee1 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision log function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/logf_data.c b/math/logf_data.c
index 53c5f62..e8973ce 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for logf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/math_config.h b/math/math_config.h
index 7a1cc81..e851043 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
 /*
  * Configuration for math routines.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
@@ -12,12 +12,17 @@
 #include <stdint.h>
 
 #ifndef WANT_ROUNDING
-/* Correct special case results in non-nearest rounding modes. */
+/* If defined to 1, return correct results for special cases in non-nearest
+   rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
+   This may be set to 0 if there is no fenv support or if math functions only
+   get called in round to nearest mode. */
 # define WANT_ROUNDING 1
 #endif
 #ifndef WANT_ERRNO
-/* Set errno according to ISO C with (math_errhandling & MATH_ERRNO) != 0. */
-# define WANT_ERRNO 1
+/* If defined to 1, set errno in math functions according to ISO C.  Many math
+   libraries do not set errno, so this is 0 by default.  It may need to be
+   set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */
+# define WANT_ERRNO 0
 #endif
 #ifndef WANT_ERRNO_UFLOW
 /* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */
@@ -293,6 +298,24 @@ check_uflow (double x)
   return WANT_ERRNO ? __math_check_uflow (x) : x;
 }
 
+/* Check if the result overflowed to infinity. */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0. */
+HIDDEN float __math_check_uflowf (float);
+
+/* Check if the result overflowed to infinity. */
+static inline float
+check_oflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Check if the result underflowed to 0. */
+static inline float
+check_uflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_uflowf (x) : x;
+}
+
 /* Shared between expf, exp2f and powf. */
 #define EXP2F_TABLE_BITS 5
@@ -411,4 +434,29 @@ extern const struct pow_log_data
   struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
 } __pow_log_data HIDDEN;
 
+extern const struct erff_data
+{
+  float erff_poly_A[6];
+  float erff_poly_B[7];
+} __erff_data HIDDEN;
+
+#define ERF_POLY_A_ORDER 19
+#define ERF_POLY_A_NCOEFFS 10
+#define ERFC_POLY_C_NCOEFFS 16
+#define ERFC_POLY_D_NCOEFFS 18
+#define ERFC_POLY_E_NCOEFFS 14
+#define ERFC_POLY_F_NCOEFFS 17
+extern const struct erf_data
+{
+  double erf_poly_A[ERF_POLY_A_NCOEFFS];
+  double erf_ratio_N_A[5];
+  double erf_ratio_D_A[5];
+  double erf_ratio_N_B[7];
+  double erf_ratio_D_B[6];
+  double erfc_poly_C[ERFC_POLY_C_NCOEFFS];
+  double erfc_poly_D[ERFC_POLY_D_NCOEFFS];
+  double erfc_poly_E[ERFC_POLY_E_NCOEFFS];
+  double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
+} __erf_data HIDDEN;
+
 #endif
diff --git a/math/math_errf.c b/math/math_errf.c
index 07154c5..d5350b8 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision math error handling.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
@@ -64,3 +64,17 @@ __math_invalidf (float x)
   float y = (x - x) / (x - x);
   return isnan (x) ? y : with_errnof (y, EDOM);
 }
+
+/* Check result and set errno if necessary. */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+  return y == 0.0f ? with_errnof (y, ERANGE) : y;
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+  return isinf (y) ? with_errnof (y, ERANGE) : y;
+}
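Since WANT_ERRNO now defaults to 0, check_uflowf usually collapses to a no-op and the __math_check_uflowf call above is compiled out. A standalone sketch of what the WANT_ERRNO=1 path does (demo names, not the library's internal macros):

#include <errno.h>
#include <math.h>

/* Stand-in for __math_check_uflowf: flag ERANGE when the result has
   already rounded to zero, mirroring the HIDDEN helper above.  */
static float
check_uflowf_demo (float y)
{
  if (y == 0.0f)
    errno = ERANGE;
  return y;
}

int
main (void)
{
  /* Half the smallest subnormal rounds to 0 in round-to-nearest, the
     kind of result erff's tiny-input path feeds through its checker.  */
  float y = check_uflowf_demo (0x1p-149f * 0.5f);
  return (y == 0.0f && errno == ERANGE) ? 0 : 1;
}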
@@ -1,7 +1,7 @@
 /*
  * Double-precision x^y function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/powf.c b/math/powf.c
index 1534a09..6ba45d3 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision pow function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index b9fbdc4..97e0d98 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for powf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/sincosf.c b/math/sincosf.c
index e6cd41e..9746f1c 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin/cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index 5d0b58e..ab4ac47 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for sinf, cosf and sincosf.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/sinf.c b/math/sinf.c
index 770b294..ddbc1da 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 33ceda3..0c17826 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
 /*
  * Microbenchmark for math functions.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
@@ -248,6 +248,7 @@ D (log2, 0.999, 1.001)
 {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
 D (xpow, 0.01, 11.1)
 D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
 
 F (dummyf, 1.0, 2.0)
 F (expf, -9.9, 9.9)
@@ -275,6 +276,7 @@ F (cosf, -3.1, 3.1)
 F (cosf, 3.3, 33.3)
 F (cosf, 100, 1000)
 F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
 #if WANT_VMATH
 D (__s_sin, -3.1, 3.1)
 D (__s_cos, -3.1, 3.1)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 2ff8c3f..3108967 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
 /*
  * mathtest.c - test rig for mathlib
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index f416477..6be79e1 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
 /*
  * dotest.c - actually generate mathlib test cases
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index af574b0..12a9c74 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -1,7 +1,7 @@
 /*
  * intern.h
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index e94e455..0d8ead8 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -1,7 +1,7 @@
 /*
  * main.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index e97a8c6..5612396 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -1,7 +1,7 @@
 /*
  * random.c - random number generator for producing mathlib test cases
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index c1ce956..b4b22df 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -1,7 +1,7 @@
 /*
  * random.h - header for random.c
  *
- * Copyright (c) 2009-2018, Arm Limited.
+ * Copyright (c) 2009-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index 938dc3a..c9f0daf 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -1,7 +1,7 @@
 /*
  * semi.c: test implementations of mathlib seminumerical functions
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index da473a2..17dc415 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -1,7 +1,7 @@
 /*
  * semi.h: header for semi.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 1a76c2e..53cd557 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -1,7 +1,7 @@
 /*
  * types.h
  *
- * Copyright (c) 2005-2018, Arm Limited.
+ * Copyright (c) 2005-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index acaf671..de45ac5 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -1,7 +1,7 @@
 /*
  * wrappers.c - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 5804935..7b09c85 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -1,7 +1,7 @@
 /*
  * wrappers.h - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index a8c391b..0190d9a 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
 # ULP error check script.
 #
-# Copyright (c) 2019, Arm Limited.
+# Copyright (c) 2019-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 #set -x
@@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
 
+L=1.0
+Ldir=0.9
+t erf 0 0xffff000000000000 10000
+t erf 0x1p-1022 0x1p-26 40000
+t erf -0x1p-1022 -0x1p-26 40000
+t erf 0x1p-26 0x1p3 40000
+t erf -0x1p-26 -0x1p3 40000
+t erf 0 inf 40000
+Ldir=0.5
+
 L=0.01
 t expf 0 0xffff0000 10000
 t expf 0x1p-14 0x1p8 50000
@@ -119,6 +129,17 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
 t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
 t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+
+L=0.6
+Ldir=0.9
+t erff 0 0xffff0000 10000
+t erff 0x1p-127 0x1p-26 40000
+t erff -0x1p-127 -0x1p-26 40000
+t erff 0x1p-26 0x1p3 40000
+t erff -0x1p-26 -0x1p3 40000
+t erff 0 inf 40000
+Ldir=0.5
+
 done
 
 # vector functions
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 5dc0994..7916044 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,6 +1,6 @@
 ; cosf.tst - Directed test cases for SP cosine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=cosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
new file mode 100644
index 0000000..7fa4d18
--- /dev/null
+++ b/math/test/testcases/directed/erf.tst
@@ -0,0 +1,17 @@
+; erf.tst - Directed test cases for erf
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
+func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
+func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
+func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
+func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
+func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
+func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000..d05b7b1
--- /dev/null
+++ b/math/test/testcases/directed/erff.tst
@@ -0,0 +1,17 @@
+; erff.tst
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
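A reading of the directed-test format, inferred from the lines above rather than documented in this change: operands and expected results are raw IEEE-754 bit patterns, so in `func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0` the operand is the 64-bit pattern of 1.0, the result encodes erf(1.0) ≈ 0.8427 with the trailing `.c6d` apparently supplying extra result bits for rounding checks, and `status=i`/`status=ux` assert the invalid and underflow+inexact floating-point exception flags.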
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index addfc0a..85d556c 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index 04a5a50..fa56c9f 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 2b6a9b5..38cfc3f 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,6 +1,6 @@
 ; exp2f.tst - Directed test cases for exp2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index 74664c7..ff0f671 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,6 +1,6 @@
 ; expf.tst - Directed test cases for expf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=expf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index eeb762c..a0aa398 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index e0765d8..ff1286c 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 8d685ba..5832c4f 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,6 +1,6 @@
 ; log2f.tst - Directed test cases for log2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 7ccc873..6e68a36 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,6 +1,6 @@
 ; logf.tst - Directed test cases for logf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=logf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index a4c42be..1966581 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for pow
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index efd1dd5..3fa8b11 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,6 +1,6 @@
 ; powf.tst - Directed test cases for powf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index b4b2526..4b33d22 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for SP sincos
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index 13cfdca..ded80b1 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,6 +1,6 @@
 ; sinf.tst - Directed test cases for SP sine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c37e837..c24ff80 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,6 +1,6 @@
 !! double.tst - Random test case specification for DP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test exp 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index baf62b9..d02a227 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,6 +1,6 @@
 !! single.tst - Random test case specification for SP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test sinf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 371567a..51479b8 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,7 +1,7 @@
 /*
  * ULP error checking tool for math functions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
@@ -331,11 +331,13 @@ static const struct fun fun[] = {
  F1 (log)
  F1 (log2)
  F2 (pow)
+ F1 (erf)
  D1 (exp)
  D1 (exp2)
  D1 (log)
  D1 (log2)
  D2 (pow)
+ D1 (erf)
 #if WANT_VMATH
  F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
  F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index f479fc5..2ff436f 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -3,7 +3,7 @@
 #
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
-# Copyright (c) 2015-2018, Arm Limited.
+# Copyright (c) 2015-2019, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 import Base.\
diff --git a/math/v_math.h b/math/v_math.h
index 3db22e5..f2cc467 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,7 +1,7 @@
 /*
  * Vector math abstractions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
***" + @exit 1 +else + +networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS]) +networking-test-srcs := $(wildcard $(S)/test/*.c) + +networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) + +networking-libs := \ + build/lib/libnetworking.so \ + build/lib/libnetworking.a \ + +networking-tools := \ + build/bin/test/chksum + +networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs))) +networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs))) + +networking-objs := \ + $(networking-lib-objs) \ + $(networking-lib-objs:%.o=%.os) \ + $(networking-test-objs) \ + +networking-files := \ + $(networking-objs) \ + $(networking-libs) \ + $(networking-tools) \ + $(networking-includes) \ + +all-networking: $(networking-libs) $(networking-tools) $(networking-includes) + +$(networking-objs): $(networking-includes) +$(networking-objs): CFLAGS_ALL += $(networking-cflags) + +build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os) + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ + +build/lib/libnetworkinglib.a: $(networking-lib-objs) + rm -f $@ + $(AR) rc $@ $^ + $(RANLIB) $@ + +build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/include/%.h: $(S)/include/%.h + cp $< $@ + +build/bin/%.sh: $(S)/test/%.sh + cp $< $@ + +check-networking: $(networking-tools) + $(EMULATOR) build/bin/test/chksum -i simple + $(EMULATOR) build/bin/test/chksum -i scalar + $(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available + +install-networking: \ + $(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ + $(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%) + +clean-networking: + rm -f $(networking-files) +endif + +.PHONY: all-networking check-networking install-networking clean-networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c new file mode 100644 index 0000000..6d5be58 --- /dev/null +++ b/networking/aarch64/chksum_simd.c @@ -0,0 +1,146 @@ +/* + * AArch64-specific checksum implementation using NEON + * + * Copyright (c) 2020, Arm Limited. 
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
new file mode 100644
index 0000000..6d5be58
--- /dev/null
+++ b/networking/aarch64/chksum_simd.c
@@ -0,0 +1,146 @@
+/*
+ * AArch64-specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+always_inline
+static inline uint64_t
+slurp_head64(const void **pptr, uint32_t *nbytes)
+{
+    Assert(*nbytes >= 8);
+    uint64_t sum = 0;
+    uint32_t off = (uintptr_t) *pptr % 8;
+    if (likely(off != 0))
+    {
+        /* Get rid of bytes 0..off-1 */
+        const unsigned char *ptr64 = align_ptr(*pptr, 8);
+        uint64_t mask = ALL_ONES << (CHAR_BIT * off);
+        uint64_t val = load64(ptr64) & mask;
+        /* Fold 64-bit sum to 33 bits */
+        sum = val >> 32;
+        sum += (uint32_t) val;
+        *pptr = ptr64 + 8;
+        *nbytes -= 8 - off;
+    }
+    return sum;
+}
+
+always_inline
+static inline uint64_t
+slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
+{
+    Assert(nbytes < 8);
+    if (likely(nbytes != 0))
+    {
+        /* Get rid of bytes 7..nbytes */
+        uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
+        Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
+        uint64_t val = load64(ptr) & mask;
+        sum += val >> 32;
+        sum += (uint32_t) val;
+        nbytes = 0;
+    }
+    Assert(nbytes == 0);
+    return sum;
+}
+
+unsigned short
+__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
+{
+    bool swap = (uintptr_t) ptr & 1;
+    uint64_t sum;
+
+    if (unlikely(nbytes < 50))
+    {
+        sum = slurp_small(ptr, nbytes);
+        swap = false;
+        goto fold;
+    }
+
+    /* 8-byte align pointer */
+    Assert(nbytes >= 8);
+    sum = slurp_head64(&ptr, &nbytes);
+    Assert(((uintptr_t) ptr & 7) == 0);
+
+    const uint32_t *may_alias ptr32 = ptr;
+
+    uint64x2_t vsum0 = { 0, 0 };
+    uint64x2_t vsum1 = { 0, 0 };
+    uint64x2_t vsum2 = { 0, 0 };
+    uint64x2_t vsum3 = { 0, 0 };
+
+    /* Sum groups of 64 bytes */
+    for (uint32_t i = 0; i < nbytes / 64; i++)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+        uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        vsum2 = vpadalq_u32(vsum2, vtmp2);
+        vsum3 = vpadalq_u32(vsum3, vtmp3);
+        ptr32 += 16;
+    }
+    nbytes %= 64;
+
+    /* Fold vsum2 and vsum3 into vsum0 and vsum1 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+    vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+
+    /* Add any trailing group of 32 bytes */
+    if (nbytes & 32)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        ptr32 += 8;
+        nbytes -= 32;
+    }
+    Assert(nbytes < 32);
+
+    /* Fold vsum1 into vsum0 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+    /* Add any trailing group of 16 bytes */
+    if (nbytes & 16)
+    {
+        uint32x4_t vtmp = vld1q_u32(ptr32);
+        vsum0 = vpadalq_u32(vsum0, vtmp);
+        ptr32 += 4;
+        nbytes -= 16;
+    }
+    Assert(nbytes < 16);
+
+    /* Add any trailing group of 8 bytes */
+    if (nbytes & 8)
+    {
+        uint32x2_t vtmp = vld1_u32(ptr32);
+        vsum0 = vaddw_u32(vsum0, vtmp);
+        ptr32 += 2;
+        nbytes -= 8;
+    }
+    Assert(nbytes < 8);
+
+    uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
+    sum += val >> 32;
+    sum += (uint32_t) val;
+
+    /* Handle any trailing 0..7 bytes */
+    sum = slurp_tail64(sum, ptr32, nbytes);
+
+fold:
+    return fold_and_swap(sum, swap);
+}
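The hot loop leans on vpadalq_u32, which pairwise-adds four u32 lanes into two u64 accumulator lanes so 32-bit loads can be summed without overflow. A minimal standalone sketch of the same idiom (illustrative only, not the library code):

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
    uint32_t data[8] = { 0xffffffff, 1, 2, 3, 4, 5, 6, 7 };
    uint64x2_t acc = { 0, 0 };

    /* Two rounds of the pairwise-accumulate idiom from the 64-byte loop:
       lane 0 gathers data[0]+data[1] and data[4]+data[5], lane 1 the rest.  */
    acc = vpadalq_u32 (acc, vld1q_u32 (data));
    acc = vpadalq_u32 (acc, vld1q_u32 (data + 4));

    /* Horizontal add across lanes, as done once after the loop.  */
    uint64_t sum = vgetq_lane_u64 (acc, 0) + vgetq_lane_u64 (acc, 1);
    printf ("%llx\n", (unsigned long long) sum);  /* prints 10000001b */
    return 0;
}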
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
new file mode 100644
index 0000000..7f69adf
--- /dev/null
+++ b/networking/arm/chksum_simd.c
@@ -0,0 +1,149 @@
+/*
+ * Armv7-A specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+unsigned short
+__chksum_arm_simd(const void *ptr, unsigned int nbytes)
+{
+    bool swap = (uintptr_t) ptr & 1;
+    uint64x1_t vsum = { 0 };
+
+    if (unlikely(nbytes < 40))
+    {
+        uint64_t sum = slurp_small(ptr, nbytes);
+        return fold_and_swap(sum, false);
+    }
+
+    /* 8-byte align pointer */
+    /* Inline slurp_head-like code since we use NEON here */
+    Assert(nbytes >= 8);
+    uint32_t off = (uintptr_t) ptr & 7;
+    if (likely(off != 0))
+    {
+        const uint64_t *may_alias ptr64 = align_ptr(ptr, 8);
+        uint64x1_t vword64 = vld1_u64(ptr64);
+        /* Get rid of bytes 0..off-1 */
+        uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+        int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off);
+        vmask = vshl_u64(vmask, vshiftl);
+        vword64 = vand_u64(vword64, vmask);
+        uint32x2_t vtmp = vreinterpret_u32_u64(vword64);
+        /* Set accumulator */
+        vsum = vpaddl_u32(vtmp);
+        /* Update pointer and remaining size */
+        ptr = (char *) ptr64 + 8;
+        nbytes -= 8 - off;
+    }
+    Assert(((uintptr_t) ptr & 7) == 0);
+
+    /* Sum groups of 64 bytes */
+    uint64x2_t vsum0 = { 0, 0 };
+    uint64x2_t vsum1 = { 0, 0 };
+    uint64x2_t vsum2 = { 0, 0 };
+    uint64x2_t vsum3 = { 0, 0 };
+    const uint32_t *may_alias ptr32 = ptr;
+    for (uint32_t i = 0; i < nbytes / 64; i++)
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+        uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+        uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        vsum1 = vpadalq_u32(vsum1, vtmp1);
+        vsum2 = vpadalq_u32(vsum2, vtmp2);
+        vsum3 = vpadalq_u32(vsum3, vtmp3);
+        ptr32 += 16;
+    }
+    nbytes %= 64;
+
+    /* Fold vsum1/vsum2/vsum3 into vsum0 */
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+    vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+    vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+    /* Add any trailing 16-byte groups */
+    while (likely(nbytes >= 16))
+    {
+        uint32x4_t vtmp0 = vld1q_u32(ptr32);
+        vsum0 = vpadalq_u32(vsum0, vtmp0);
+        ptr32 += 4;
+        nbytes -= 16;
+    }
+    Assert(nbytes < 16);
+
+    /* Fold vsum0 into vsum */
+    {
+        /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+        vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+        /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */
+        vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+        /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+        Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0);
+        Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0);
+        uint32x2_t vtmp = vmovn_u64(vsum0);
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vtmp);
+    }
+
+    /* Add any trailing group of 8 bytes */
+    if (nbytes & 8)
+    {
+        uint32x2_t vtmp = vld1_u32(ptr32);
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vtmp);
+        ptr32 += 2;
+        nbytes -= 8;
+    }
+    Assert(nbytes < 8);
+
+    /* Handle any trailing 1..7 bytes */
+    if (likely(nbytes != 0))
+    {
+        Assert(((uintptr_t) ptr32 & 7) == 0);
+        Assert(nbytes < 8);
+        uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32);
+        /* Get rid of bytes 7..nbytes */
+        uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+        int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes));
+        vmask = vshl_u64(vmask, vshiftr); /* Shift right */
+        vword64 = vand_u64(vword64, vmask);
+        /* Fold 64-bit sum to 33 bits */
+        vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64));
+        /* Add to accumulator */
+        vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
+    }
+
+    /* Fold 64-bit vsum to 32 bits */
+    vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+    vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+    Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
+
+    /* Fold 32-bit vsum to 16 bits */
+    uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
+    vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+    vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0);
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0);
+    Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0);
+
+    /* Convert to 16-bit scalar */
+    uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0);
+
+    if (unlikely(swap)) /* Odd base pointer is unexpected */
+    {
+        sum = bswap16(sum);
+    }
+    return sum;
+}
diff --git a/networking/chksum.c b/networking/chksum.c
new file mode 100644
index 0000000..95ce5ba
--- /dev/null
+++ b/networking/chksum.c
@@ -0,0 +1,81 @@
+/*
+ * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
+ * This sum is often used as a simple checksum in networking.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "chksum_common.h"
+
+always_inline
+static inline uint32_t
+slurp_head32(const void **pptr, uint32_t *nbytes)
+{
+    uint32_t sum = 0;
+    Assert(*nbytes >= 4);
+    uint32_t off = (uintptr_t) *pptr % 4;
+    if (likely(off != 0))
+    {
+        /* Get rid of bytes 0..off-1 */
+        const unsigned char *ptr32 = align_ptr(*pptr, 4);
+        uint32_t mask = ~0U << (CHAR_BIT * off);
+        sum = load32(ptr32) & mask;
+        *pptr = ptr32 + 4;
+        *nbytes -= 4 - off;
+    }
+    return sum;
+}
+
+/* Additional loop unrolling would help when not auto-vectorizing */
+unsigned short
+__chksum(const void *ptr, unsigned int nbytes)
+{
+    bool swap = false;
+    uint64_t sum = 0;
+
+    if (nbytes > 300)
+    {
+        /* 4-byte align pointer */
+        swap = (uintptr_t) ptr & 1;
+        sum = slurp_head32(&ptr, &nbytes);
+    }
+    /* Else benefit of aligning not worth the overhead */
+
+    /* Sum all 16-byte chunks */
+    const char *cptr = ptr;
+    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
+    {
+        uint64_t h0 = load32(cptr + 0);
+        uint64_t h1 = load32(cptr + 4);
+        uint64_t h2 = load32(cptr + 8);
+        uint64_t h3 = load32(cptr + 12);
+        sum += h0 + h1 + h2 + h3;
+        cptr += 16;
+    }
+    nbytes %= 16;
+    Assert(nbytes < 16);
+
+    /* Handle any trailing 4-byte chunks */
+    while (nbytes >= 4)
+    {
+        sum += load32(cptr);
+        cptr += 4;
+        nbytes -= 4;
+    }
+    Assert(nbytes < 4);
+
+    if (nbytes & 2)
+    {
+        sum += load16(cptr);
+        cptr += 2;
+    }
+
+    if (nbytes & 1)
+    {
+        sum += *(uint8_t *)cptr;
+    }
+
+    return fold_and_swap(sum, swap);
+}
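The folding that finishes __chksum (fold_and_swap, defined in chksum_common.h below) is plain end-around-carry arithmetic and can be checked by hand. A small sketch with concrete numbers (illustrative, not library code):

#include <stdint.h>
#include <stdio.h>

/* End-around-carry fold of a 64-bit partial sum down to 16 bits,
   mirroring fold_and_swap without the byte swap.  */
static uint16_t
fold16 (uint64_t sum)
{
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) sum;
}

int
main (void)
{
    /* Two 16-bit words 0xffff + 0x0001 overflow 16 bits; in ones'
       complement the carry wraps around, so the sum is 0x0001.  */
    printf ("%04x\n", fold16 (0xffffu + 0x0001u)); /* prints 0001 */
    return 0;
}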
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
new file mode 100644
index 0000000..958c8cc
--- /dev/null
+++ b/networking/chksum_common.h
@@ -0,0 +1,132 @@
+/*
+ * Common code for checksum implementations
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef CHKSUM_COMMON_H
+#define CHKSUM_COMMON_H
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+#error Only little endian supported
+#endif
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Assertions must be explicitly enabled */
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define may_alias __attribute__((__may_alias__))
+#define always_inline __attribute__((always_inline))
+#ifdef __clang__
+#define no_unroll_loops
+#else
+#define no_unroll_loops __attribute__((optimize("no-unroll-loops")))
+#endif
+#define bswap16(x) __builtin_bswap16((x))
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#define may_alias
+#define always_inline
+#define no_unroll_loops
+#define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8))
+#endif
+
+#define ALL_ONES ~UINT64_C(0)
+
+static inline
+uint64_t load64(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint64_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+static inline
+uint32_t load32(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint32_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+static inline
+uint16_t load16(const void *ptr)
+{
+    /* GCC will optimise this to a normal load instruction */
+    uint16_t v;
+    memcpy(&v, ptr, sizeof v);
+    return v;
+}
+
+/* slurp_small() is for small buffers, don't waste cycles on alignment */
+no_unroll_loops
+always_inline
+static inline uint64_t
+slurp_small(const void *ptr, uint32_t nbytes)
+{
+    const unsigned char *cptr = ptr;
+    uint64_t sum = 0;
+    while (nbytes >= 4)
+    {
+        sum += load32(cptr);
+        cptr += 4;
+        nbytes -= 4;
+    }
+    if (nbytes & 2)
+    {
+        sum += load16(cptr);
+        cptr += 2;
+    }
+    if (nbytes & 1)
+    {
+        sum += (uint8_t) *cptr;
+    }
+    return sum;
+}
+
+static inline const void *
+align_ptr(const void *ptr, size_t bytes)
+{
+    return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes);
+}
+
+always_inline
+static inline uint16_t
+fold_and_swap(uint64_t sum, bool swap)
+{
+    /* Fold 64-bit sum to 32 bits */
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    sum = (sum & 0xffffffff) + (sum >> 32);
+    Assert(sum == (uint32_t) sum);
+
+    /* Fold 32-bit sum to 16 bits */
+    sum = (sum & 0xffff) + (sum >> 16);
+    sum = (sum & 0xffff) + (sum >> 16);
+    Assert(sum == (uint16_t) sum);
+
+    if (unlikely(swap)) /* Odd base pointer is unexpected */
+    {
+        sum = bswap16(sum);
+    }
+
+    return (uint16_t) sum;
+}
+
+#endif
diff --git a/networking/include/networking.h b/networking/include/networking.h
new file mode 100644
index 0000000..a88feff
--- /dev/null
+++ b/networking/include/networking.h
@@ -0,0 +1,14 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+unsigned short __chksum (const void *, unsigned int);
+#if __aarch64__ && __ARM_NEON
+unsigned short __chksum_aarch64_simd (const void *, unsigned int);
+#endif
+#if __arm__ && __ARM_NEON
+unsigned short __chksum_arm_simd (const void *, unsigned int);
+#endif
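The public entry points take only a pointer and a byte count. A minimal caller sketch against networking/include/networking.h (assuming the header is on the include path; the buffer contents are made-up):

#include <stdio.h>
#include "networking.h"

int
main (void)
{
    /* 20 arbitrary header-sized bytes; a real IPv4 checksum would be
       computed with the checksum field zeroed, then stored complemented.  */
    unsigned char hdr[20] = { 0x45, 0x00, 0x00, 0x54, 0xa6, 0xf2 };

    unsigned short sum = __chksum (hdr, sizeof hdr);
#if __aarch64__ && __ARM_NEON
    sum = __chksum_aarch64_simd (hdr, sizeof hdr); /* same result, NEON path */
#endif
    printf ("%04x\n", sum);
    return 0;
}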
+ * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <inttypes.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <unistd.h> +#include "../include/networking.h" + +#if WANT_ASSERT +#undef NDEBUG +#include <assert.h> +#define Assert(exp) assert(exp) +#else +#define Assert(exp) (void) (exp) +#endif + +#ifdef __GNUC__ +#define may_alias __attribute__((__may_alias__)) +#else +#define may_alias +#endif + +#define CACHE_LINE 64 +#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) + +/* Reference implementation - do not modify! */ +static uint16_t +checksum_simple(const void *ptr, uint32_t nbytes) +{ + const uint16_t *may_alias hptr = ptr; + uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */ + + /* Sum all halfwords, assume misaligned accesses are handled in HW */ + for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--) + { + sum += *hptr++; + } + + /* Add any trailing odd byte */ + if ((nbytes & 0x01) != 0) + { + sum += *(uint8_t *) hptr; + } + + /* Fold 64-bit sum to 32 bits */ + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + Assert(sum == (uint32_t) sum); + + /* Fold 32-bit sum to 16 bits */ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + Assert(sum == (uint16_t) sum); + + return (uint16_t) sum; +} + +static struct +{ + uint16_t (*cksum_fp)(const void *, uint32_t); + const char *name; +} implementations[] = +{ + { checksum_simple, "simple"}, + { __chksum, "scalar"}, +#if __arm__ + { __chksum_arm_simd, "simd" }, +#elif __aarch64__ + { __chksum_aarch64_simd, "simd" }, +#endif + { NULL, NULL} +}; + +static int +find_impl(const char *name) +{ + for (int i = 0; implementations[i].name != NULL; i++) + { + if (strcmp(implementations[i].name, name) == 0) + { + return i; + } + } + return -1; +} + +static uint16_t (*CKSUM_FP)(const void *, uint32_t); +static volatile uint16_t SINK; + +static bool +verify(const void *data, uint32_t offset, uint32_t size) +{ + + uint16_t csum_expected = checksum_simple(data, size); + uint16_t csum_actual = CKSUM_FP(data, size); + if (csum_actual != csum_expected) + { + fprintf(stderr, "\nInvalid checksum for offset %u size %u: " + "actual %04x expected %04x (valid)", + offset, size, csum_actual, csum_expected); + if (size < 65536) + { + /* Fatal error */ + exit(EXIT_FAILURE); + } + /* Else some implementations only support sizes up to 2^16 */ + return false; + } + return true; +} + +static uint64_t +clock_get_ns(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; +} + +static void +benchmark(const uint8_t *base, + size_t poolsize, + uint32_t blksize, + uint32_t numops, + uint64_t cpufreq) +{ + printf("%11u ", (unsigned int) blksize); fflush(stdout); + + uint64_t start = clock_get_ns(); + for (uint32_t i = 0; i < numops; i ++) + { + /* Read a random value from the pool */ + uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)]; + /* Generate a random starting address */ + const void *data = &base[random % (poolsize - blksize)]; + SINK = CKSUM_FP(data, blksize); + } + uint64_t end = clock_get_ns(); + +#define MEGABYTE 1000000 /* Decimal megabyte (MB) */ + uint64_t elapsed_ns = end - start; + uint64_t elapsed_ms = elapsed_ns / 1000000; + uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000); + uint64_t accbytes = (uint64_t) numops * blksize; + printf("%11ju ", (uintmax_t) 
((accbytes / elapsed_ms) * 1000) / MEGABYTE); + unsigned int cyc_per_blk = cpufreq / blks_per_s; + printf("%11u ", cyc_per_blk); + if (blksize != 0) + { + unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize; + printf("%7u.%03u ", + cyc_per_byte / 1000, cyc_per_byte % 1000); + } + printf("\n"); +} + +int main(int argc, char *argv[]) +{ + int c; + bool DUMP = false; + uint32_t IMPL = 0;/* Simple implementation */ + uint64_t CPUFREQ = 0; + uint32_t BLKSIZE = 0; + uint32_t NUMOPS = 1000000; + uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */ + + setvbuf(stdout, NULL, _IOLBF, 160); + while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1) + { + switch (c) + { + case 'b' : + { + int blksize = atoi(optarg); + if (blksize < 1 || blksize > POOLSIZE / 2) + { + fprintf(stderr, "Invalid block size %d\n", blksize); + exit(EXIT_FAILURE); + } + BLKSIZE = (unsigned) blksize; + break; + } + case 'd' : + DUMP = true; + break; + case 'f' : + { + int64_t cpufreq = atoll(optarg); + if (cpufreq < 1) + { + fprintf(stderr, "Invalid CPU frequency %"PRId64"\n", + cpufreq); + exit(EXIT_FAILURE); + } + CPUFREQ = cpufreq; + break; + } + case 'i' : + { + int impl = find_impl(optarg); + if (impl < 0) + { + fprintf(stderr, "Invalid implementation %s\n", optarg); + goto usage; + } + IMPL = (unsigned) impl; + break; + } + case 'n' : + { + int numops = atoi(optarg); + if (numops < 1) + { + fprintf(stderr, "Invalid number of operations %d\n", numops); + exit(EXIT_FAILURE); + } + NUMOPS = (unsigned) numops; + break; + } + case 'p' : + { + int poolsize = atoi(optarg); + if (poolsize < 4096) + { + fprintf(stderr, "Invalid pool size %d\n", poolsize); + exit(EXIT_FAILURE); + } + char c = optarg[strlen(optarg) - 1]; + if (c == 'M') + { + POOLSIZE = (unsigned) poolsize * 1024 * 1024; + } + else if (c == 'K') + { + POOLSIZE = (unsigned) poolsize * 1024; + } + else + { + POOLSIZE = (unsigned) poolsize; + } + break; + } + default : +usage : + fprintf(stderr, "Usage: checksum <options>\n" + "-b <blksize> Block size\n" + "-d Dump first 96 bytes of data\n" + "-f <cpufreq> CPU frequency (Hz)\n" + "-i <impl> Implementation\n" + "-n <numops> Number of operations\n" + "-p <poolsize> Pool size (K or M suffix)\n" + ); + printf("Implementations:"); + for (int i = 0; implementations[i].name != NULL; i++) + { + printf(" %s", implementations[i].name); + } + printf("\n"); + exit(EXIT_FAILURE); + } + } + if (optind > argc) + { + goto usage; + } + + CKSUM_FP = implementations[IMPL].cksum_fp; + POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE); + uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (base == MAP_FAILED) + { + perror("aligned_alloc"), exit(EXIT_FAILURE); + } + for (size_t i = 0; i < POOLSIZE / 4; i++) + { + ((uint32_t *) base)[i] = rand(); + } + + printf("Implementation: %s\n", implementations[IMPL].name); + printf("numops %u, poolsize ", NUMOPS); + if (POOLSIZE % (1024 * 1024) == 0) + { + printf("%uMiB", POOLSIZE / (1024 * 1024)); + } + else if (POOLSIZE % 1024 == 0) + { + printf("%uKiB", POOLSIZE / 1024); + } + else + { + printf("%uB", POOLSIZE); + } + printf(", blocksize %u, CPU frequency %juMHz\n", + BLKSIZE, (uintmax_t) (CPUFREQ / 1000000)); +#if WANT_ASSERT + printf("Warning: assertions are enabled\n"); +#endif + + if (DUMP) + { + /* Print out first 96 bytes of data for human debugging */ + for (int i = 0; i < 96; i++) + { + if (i % 8 == 0) + printf("%2u:", i); + printf(" %02x", base[i]); + if (i % 8 == 7) + printf("\n"); + } + } + + /* Verify that chosen algorithm handles all 
combinations of offsets and sizes */ + printf("Verifying..."); fflush(stdout); + bool success = true; + /* Check all (relevant) combinations of size and offset */ + for (int size = 0; size <= 256; size++) + { + for (int offset = 0; offset < 255; offset++) + { + /* Check at start of mapped memory */ + success &= verify(&base[offset], offset, size); + /* Check at end of mapped memory */ + uint8_t *p = base + POOLSIZE - (size + offset); + success &= verify(p, (uintptr_t) p % 64, size); + } + } + /* Check increasingly larger sizes */ + for (size_t size = 1; size < POOLSIZE; size *= 2) + { + success &= verify(base, 0, size); + } + /* Check the full size, this can detect accumulator overflows */ + success &= verify(base, 0, POOLSIZE); + printf("%s\n", success ? "OK" : "failure"); + + /* Print throughput in decimal megabyte (1000000B) per second */ + if (CPUFREQ != 0) + { + printf("%11s %11s %11s %11s\n", + "block size", "MB/s", "cycles/blk", "cycles/byte"); + } + else + { + printf("%11s %11s %11s %11s\n", + "block size", "MB/s", "ns/blk", "ns/byte"); + CPUFREQ = 1000000000; + } + if (BLKSIZE != 0) + { + benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ); + } + else + { + static const uint16_t sizes[] = + { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 }; + for (int i = 0; sizes[i] != 0; i++) + { + uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]); + benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ); + } + } + + if (munmap(base, POOLSIZE) != 0) + { + perror("munmap"), exit(EXIT_FAILURE); + } + + return success ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/run-arm-optimized-routines-tests-on-android.sh b/run-arm-optimized-routines-tests-on-android.sh index 61efeaf..21163a3 100755 --- a/run-arm-optimized-routines-tests-on-android.sh +++ b/run-arm-optimized-routines-tests-on-android.sh @@ -25,16 +25,20 @@ check_failure() { } # Run the 32-bit tests. -adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/* -check_failure +if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest/mathtest/mathtest" ]; then + adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/* + check_failure +fi # TODO: these tests are currently a bloodbath. #adb shell 'cp /data/nativetest/ulp/math/test/runulp.sh /data/nativetest/ulp/ && sh /data/nativetest/ulp/runulp.sh' #check_failure # Run the 64-bit tests. -adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/* -check_failure +if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest64/mathtest/mathtest" ]; then + adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/* + check_failure +fi # TODO: these tests are currently a bloodbath. #adb shell 'cp /data/nativetest64/ulp/math/test/runulp.sh /data/nativetest64/ulp/ && sh /data/nativetest64/ulp/runulp.sh' diff --git a/string/Dir.mk b/string/Dir.mk index 470917a..cf3453f 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,13 +1,20 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019, Arm Limited. +# Copyright (c) 2019-2021, Arm Limited. # SPDX-License-Identifier: MIT S := $(srcdir)/string B := build/string -string-lib-srcs := $(wildcard $(S)/*.[cS]) +ifeq ($(ARCH),) +all-string bench-string check-string install-string clean-string: + @echo "*** Please set ARCH in config.mk. 
***" + @exit 1 +else + +string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS]) string-test-srcs := $(wildcard $(S)/test/*.c) +string-bench-srcs := $(wildcard $(S)/bench/*.c) string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) @@ -15,13 +22,17 @@ string-libs := \ build/lib/libstringlib.so \ build/lib/libstringlib.a \ -string-tools := \ +string-tests := \ build/bin/test/memcpy \ build/bin/test/memmove \ build/bin/test/memset \ build/bin/test/memchr \ + build/bin/test/memrchr \ build/bin/test/memcmp \ + build/bin/test/__mtag_tag_region \ + build/bin/test/__mtag_tag_zero_region \ build/bin/test/strcpy \ + build/bin/test/stpcpy \ build/bin/test/strcmp \ build/bin/test/strchr \ build/bin/test/strrchr \ @@ -30,25 +41,34 @@ string-tools := \ build/bin/test/strnlen \ build/bin/test/strncmp +string-benches := \ + build/bin/bench/memcpy \ + build/bin/bench/strlen + string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs))) string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs))) +string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs))) string-objs := \ $(string-lib-objs) \ $(string-lib-objs:%.o=%.os) \ $(string-test-objs) \ + $(string-bench-objs) string-files := \ $(string-objs) \ $(string-libs) \ - $(string-tools) \ + $(string-tests) \ + $(string-benches) \ $(string-includes) \ -all-string: $(string-libs) $(string-tools) $(string-includes) +all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes) $(string-objs): $(string-includes) $(string-objs): CFLAGS_ALL += $(string-cflags) +$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE + build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ @@ -60,26 +80,27 @@ build/lib/libstringlib.a: $(string-lib-objs) build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) +build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + build/include/%.h: $(S)/include/%.h cp $< $@ build/bin/%.sh: $(S)/test/%.sh cp $< $@ -check-string: $(string-tools) - $(EMULATOR) build/bin/test/memcpy - $(EMULATOR) build/bin/test/memmove - $(EMULATOR) build/bin/test/memset - $(EMULATOR) build/bin/test/memchr - $(EMULATOR) build/bin/test/memcmp - $(EMULATOR) build/bin/test/strcpy - $(EMULATOR) build/bin/test/strcmp - $(EMULATOR) build/bin/test/strchr - $(EMULATOR) build/bin/test/strrchr - $(EMULATOR) build/bin/test/strchrnul - $(EMULATOR) build/bin/test/strlen - $(EMULATOR) build/bin/test/strnlen - $(EMULATOR) build/bin/test/strncmp +string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out) + +build/string/test/%.out: build/bin/test/% + $(EMULATOR) $^ | tee $@.tmp + mv $@.tmp $@ + +check-string: $(string-tests-out) + ! grep FAIL $^ + +bench-string: $(string-benches) + $(EMULATOR) build/bin/bench/strlen + $(EMULATOR) build/bin/bench/memcpy install-string: \ $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ @@ -87,5 +108,6 @@ install-string: \ clean-string: rm -f $(string-files) +endif -.PHONY: all-string check-string install-string clean-string +.PHONY: all-string bench-string check-string install-string clean-string diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S new file mode 100644 index 0000000..84339f7 --- /dev/null +++ b/string/aarch64/__mtag_tag_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_region - tag memory + * + * Copyright (c) 2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stg dstin, [dstin] + stg dstin, [tmp] + stg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + st2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gva, dst + subs count, count, 64 + b.hi L(zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + st2g dstin, [dst, 32] + st2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +END (__mtag_tag_region) +#endif diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S new file mode 100644 index 0000000..f58364c --- /dev/null +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_zero_region - tag memory and fill it with zero bytes + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_zero_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stzg dstin, [dstin] + stzg dstin, [tmp] + stzg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + stz2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. 
*/ + b.ne L(no_zva) +#endif + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gzva, dst + subs count, count, 64 + b.hi L(zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + stz2g dstin, [dst, 32] + stz2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +END (__mtag_tag_zero_region) +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S new file mode 100644 index 0000000..5a54242 --- /dev/null +++ b/string/aarch64/check-arch.S @@ -0,0 +1,13 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if !__aarch64__ +# error ARCH setting does not match the compiler. +#endif + +/* Include for GNU property notes. */ +#include "../asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S new file mode 100644 index 0000000..c2e967d --- /dev/null +++ b/string/aarch64/memchr-mte.S @@ -0,0 +1,116 @@ +/* + * memchr - find a character in a memory zone + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define wtmp w7 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vrepmask v3 +#define vend v4 +#define dend d4 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__memchr_aarch64_mte) + PTR_ARG (0) + SIZE_ARG (2) + bic src, srcin, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) + + rbit synd, synd + clz synd, synd + add result, srcin, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(start_loop): + sub tmp, src, srcin + add tmp, tmp, 16 + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 4 +L(loop32): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, 16]! 
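+	/* Second chunk of the 32-byte iteration; the bounds check below runs once per 32 bytes. */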
+ subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + add tmp, srcin, cntin + sub cntrem, tmp, src +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + cmp cntrem, synd, lsr 2 + add result, src, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(nomatch): + mov result, 0 + ret + +END (__memchr_aarch64_mte) + diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index 0d75acd..c22e659 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,28 +1,27 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __memchr_aarch64_sve - .type __memchr_aarch64_sve, %function - .p2align 4 -__memchr_aarch64_sve: +ENTRY (__memchr_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (2) dup z1.b, w1 /* duplicate c to a vector */ setffr /* initialize FFR */ mov x3, 0 /* initialize off */ - nop + .p2align 4 0: whilelo p1.b, x3, x2 /* make sure off < max */ b.none 9f @@ -59,4 +58,7 @@ __memchr_aarch64_sve: 9: mov x0, 0 /* return null */ ret - .size __memchr_aarch64_sve, . - __memchr_aarch64_sve +END (__memchr_aarch64_sve) + +#endif + diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 10be49e..353f0d1 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,7 +1,7 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -47,6 +47,8 @@ */ ENTRY (__memchr_aarch64) + PTR_ARG (0) + SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* @@ -110,7 +112,7 @@ L(end): addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi L(tail) + b.hs L(tail) L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ @@ -141,3 +143,4 @@ L(zero_length): ret END (__memchr_aarch64) + diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index d4f6026..78c5eca 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,23 +1,23 @@ /* * memcmp - compare memory * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __memcmp_aarch64_sve - .type __memcmp_aarch64_sve, %function - .p2align 4 -__memcmp_aarch64_sve: +ENTRY (__memcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ @@ -45,4 +45,7 @@ __memcmp_aarch64_sve: 9: mov x0, 0 /* return equality */ ret - .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve +END (__memcmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 6722516..3b10266 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,6 +1,6 @@ /* memcmp - compare memory * - * Copyright (c) 2013, Arm Limited. 
+ * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -28,6 +28,9 @@ #define tmp2 x8 ENTRY (__memcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) subs limit, limit, 8 b.lo L(less8) @@ -131,3 +134,4 @@ L(byte_loop): ret END (__memcmp_aarch64) + diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S new file mode 100644 index 0000000..f97f2c3 --- /dev/null +++ b/string/aarch64/memcpy-advsimd.S @@ -0,0 +1,206 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_lw w10 +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_simd) +ENTRY (__memcpy_aarch64_simd) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. 
*/ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(copy0) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] + ret + +END (__memcpy_aarch64_simd) + diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 1aad88e..dd254f6 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,7 +1,7 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2019, Arm Limited. + * Copyright (c) 2012-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -22,11 +22,11 @@ #define A_l x6 #define A_lw w6 #define A_h x7 -#define A_hw w7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_l x10 +#define C_lw w10 #define C_h x11 #define D_l x12 #define D_h x13 @@ -40,119 +40,117 @@ #define H_h srcend #define tmp1 x14 -/* This implementation of memcpy correctly handles overlaps, therefore - __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and - dst buffer overlap check from the start of memmove code to the - beginning of large copy code, the overhead of combining memcpy - and memmove implementations is negligible. +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. - Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..128 bytes which are fully unrolled, and large - copies (moves). + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. - Large forward moves align the destination and use an unrolled loop - processing 64 bytes per iteration. - - Large backward moves align dstend and use an unrolled loop processing - 64 bytes per iteration. + Large copies use a software pipelined loop processing 64 bytes per iteration. 
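+   (Each iteration stores the data loaded by the previous iteration while issuing the next loads, so load latency is hidden.)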
+ The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. */ -ENTRY (__memcpy_aarch64) ENTRY_ALIAS (__memmove_aarch64) +ENTRY (__memcpy_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) cmp count, 128 - b.hi L(move_long) + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) - /* Medium copies: 17..128 bytes. */ + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) ldp A_l, A_h, [src] ldp D_l, D_h, [srcend, -16] - cmp count, 32 - b.hi L(copy33_128) stp A_l, A_h, [dstin] stp D_l, D_h, [dstend, -16] ret - .p2align 4 - /* Small copies: 0..16 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - /* 8-15 bytes. */ - cmp count, 8 - b.lo 1f + tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 4 -1: - /* 4-7 bytes. */ - tbz count, 2, 1f + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) ldr A_lw, [src] - ldr A_hw, [srcend, -4] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - str A_hw, [dstend, -4] + str B_lw, [dstend, -4] ret - .p2align 4 - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] + ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret + strb C_lw, [dstend, -1] +L(copy0): + ret .p2align 4 - /* Copy 33..128 bytes. */ -L(copy33_128): + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] ldp B_l, B_h, [src, 16] ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] cmp count, 64 - b.hi L(copy65_128) + b.hi L(copy128) stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret .p2align 4 /* Copy 65..128 bytes. */ -L(copy65_128): +L(copy128): ldp E_l, E_h, [src, 32] ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) ldp G_l, G_h, [srcend, -64] ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] stp E_l, E_h, [dstin, 32] stp F_l, F_h, [dstin, 48] - stp G_l, G_h, [dstend, -64] - stp H_l, H_h, [dstend, -48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret .p2align 4 - /* Move more than 128 bytes. */ -L(move_long): - sub tmp1, dstin, src /* Overlap check. */ + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src cbz tmp1, L(copy0) cmp tmp1, count - b.lo L(move_long_backwards) + b.lo L(copy_long_backwards) - /* Align dst to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ + /* Copy 16 bytes and then align dst to 16-byte alignment. */ ldp D_l, D_h, [src] and tmp1, dstin, 15 @@ -179,9 +177,7 @@ L(loop64): subs count, count, 64 b.hi L(loop64) - /* Write the last full set of 64 bytes. 
The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ + /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): ldp E_l, E_h, [srcend, -64] stp A_l, A_h, [dst, 16] @@ -195,20 +191,13 @@ L(copy64_from_end): stp A_l, A_h, [dstend, -48] stp B_l, B_h, [dstend, -32] stp C_l, C_h, [dstend, -16] - -L(copy0): ret .p2align 4 - /* Move more than 128 bytes where src and dst buffers overlap - and dst > src. - - Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ -L(move_long_backwards): + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): ldp D_l, D_h, [srcend, -16] and tmp1, dstend, 15 sub srcend, srcend, tmp1 @@ -234,9 +223,7 @@ L(loop64_backwards): subs count, count, 64 b.hi L(loop64_backwards) - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ + /* Write the last iteration and copy 64 bytes from the start. */ L(copy64_from_start): ldp G_l, G_h, [src, 48] stp A_l, A_h, [dstend, -16] @@ -253,3 +240,4 @@ L(copy64_from_start): ret END (__memcpy_aarch64) + diff --git a/string/aarch64/memcpy_simd.S b/string/aarch64/memcpy_simd.S deleted file mode 100644 index fa2442f..0000000 --- a/string/aarch64/memcpy_simd.S +++ /dev/null @@ -1,265 +0,0 @@ -/* - * memcpy/memmove using SIMD registers - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses. - * - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l x14 -#define E_h x15 -#define F_l x16 -#define F_h x17 -#define G_l count -#define G_h dst -#define H_l src -#define H_h srcend -#define tmp1 x14 - -#define A_q q0 -#define B_q q1 -#define C_q q2 -#define D_q q3 -#define E_q q4 -#define F_q q5 -#define G_q q6 -#define H_q q7 - -/* This implementation of memcpy correctly handles overlaps, therefore - __memmove_aarch64_simd aliases to __memcpy_aarch64_simd. By moving the - src and dst buffer overlap check from the start of memmove code to the - beginning of large copy code, the overhead of combining memcpy - and memmove implementations is negligible. - - Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..128 bytes which are fully unrolled, and large - copies (moves). - - Large forward moves align the source and use an unrolled loop - processing 64 bytes per iteration. - - Large backward moves align srcend and use an unrolled loop processing - 64 bytes per iteration. -*/ - -ENTRY (__memcpy_aarch64_simd) -ENTRY_ALIAS (__memmove_aarch64_simd) - add srcend, src, count - add dstend, dstin, count - cmp count, 16 - b.ls L(copy16_simd) - cmp count, 128 - b.hi L(move_long_simd) - - /* Medium copies: 17..128 bytes. 
*/ - ldr A_q, [src] - ldr D_q, [srcend, -16] - cmp count, 32 - b.hi L(copy33_128_simd) - str A_q, [dstin] - str D_q, [dstend, -16] - ret - - .p2align 4 - /* Small copies: 0..16 bytes. */ -L(copy16_simd): - /* 8-15 bytes. */ - cmp count, 8 - b.lo 1f - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] - str A_h, [dstend, -8] - ret - - .p2align 4 -1: - /* 4-7 bytes. */ - tbz count, 2, 1f - ldr A_lw, [src] - ldr A_hw, [srcend, -4] - str A_lw, [dstin] - str A_hw, [dstend, -4] - ret - - .p2align 4 - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret - - .p2align 4 - /* Copy 33..128 bytes. */ -L(copy33_128_simd): - ldr B_q, [src, 16] - ldr C_q, [srcend, -32] - cmp count, 64 - b.hi L(copy65_128_simd) - str A_q, [dstin] - str D_q, [dstend, -16] - str B_q, [dstin, 16] - str C_q, [dstend, -32] - ret - - .p2align 4 - /* Copy 65..128 bytes. */ -L(copy65_128_simd): - ldr E_q, [src, 32] - ldr F_q, [src, 48] - ldr G_q, [srcend, -64] - ldr H_q, [srcend, -48] - str A_q, [dstin] - str D_q, [dstend, -16] - str B_q, [dstin, 16] - str C_q, [dstend, -32] - str E_q, [dstin, 32] - str F_q, [dstin, 48] - str G_q, [dstend, -64] - str H_q, [dstend, -48] - ret - - .p2align 4 - /* Move more than 128 bytes. */ -L(move_long_simd): - sub tmp1, dstin, src /* Overlap check. */ - cbz tmp1, L(copy0_simd) - cmp tmp1, count - b.lo L(move_long_backwards_simd) - - /* Align src to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - ldr D_q, [src] - and tmp1, src, 15 - bic src, src, 15 - sub dst, dstin, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldr A_q, [src, 16] - str D_q, [dstin] - ldr B_q, [src, 32] - ldr C_q, [src, 48] - ldr D_q, [src, 64]! - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(copy64_from_end_simd) - -L(loop64_simd): - str A_q, [dst, 16] - ldr A_q, [src, 16] - str B_q, [dst, 32] - ldr B_q, [src, 32] - str C_q, [dst, 48] - ldr C_q, [src, 48] - str D_q, [dst, 64]! - ldr D_q, [src, 64]! - subs count, count, 64 - b.hi L(loop64_simd) - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -L(copy64_from_end_simd): - ldr E_q, [srcend, -64] - str A_q, [dst, 16] - ldr A_q, [srcend, -48] - str B_q, [dst, 32] - ldr B_q, [srcend, -32] - str C_q, [dst, 48] - ldr C_q, [srcend, -16] - str D_q, [dst, 64] - str E_q, [dstend, -64] - str A_q, [dstend, -48] - str B_q, [dstend, -32] - str C_q, [dstend, -16] - -L(copy0_simd): - ret - - .p2align 4 - - /* Move more than 128 bytes where src and dst buffers overlap - and dst > src. - - Align srcend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 128 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. 
*/ - -L(move_long_backwards_simd): - ldr D_q, [srcend, -16] - and tmp1, srcend, 15 - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldr A_q, [srcend, -16] - str D_q, [dstend, -16] - ldr B_q, [srcend, -32] - ldr C_q, [srcend, -48] - ldr D_q, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls L(copy64_from_start_simd) - -L(loop64_backwards_simd): - str A_q, [dstend, -16] - ldr A_q, [srcend, -16] - str B_q, [dstend, -32] - ldr B_q, [srcend, -32] - str C_q, [dstend, -48] - ldr C_q, [srcend, -48] - str D_q, [dstend, -64]! - ldr D_q, [srcend, -64]! - subs count, count, 64 - b.hi L(loop64_backwards_simd) - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -L(copy64_from_start_simd): - ldr G_q, [src, 48] - str A_q, [dstend, -16] - ldr A_q, [src, 32] - str B_q, [dstend, -32] - ldr B_q, [src, 16] - str C_q, [dstend, -48] - ldr C_q, [src] - str D_q, [dstend, -64] - str G_q, [dstin, 48] - str A_q, [dstin, 32] - str B_q, [dstin, 16] - str C_q, [dstin] - ret - -END (__memcpy_aarch64_simd) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S new file mode 100644 index 0000000..7b4be84 --- /dev/null +++ b/string/aarch64/memrchr.S @@ -0,0 +1,117 @@ +/* + * memrchr - find last character in a memory zone. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define wtmp w7 +#define end x8 +#define endm1 x9 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vrepmask v3 +#define vend v4 +#define dend d4 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__memrchr_aarch64) + PTR_ARG (0) + add end, srcin, cntin + sub endm1, end, 1 + bic src, endm1, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + neg shift, end, lsl 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsl synd, synd, shift + cbz synd, L(start_loop) + + clz synd, synd + sub result, endm1, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(start_loop): + sub tmp, end, src + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 4 +L(loop32): + ldr qdata, [src, -16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, -16]! 
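+	/* Backwards counterpart of the memchr loop above: the remaining count is checked once per 32 bytes. */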
+ subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + + add tmp, src, 15 +#ifdef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + sub tmp, tmp, synd, lsr 2 + cmp tmp, srcin + csel result, tmp, xzr, hs + ret + +L(nomatch): + mov result, 0 + ret + +END (__memrchr_aarch64) + diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 3868141..9fcd975 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,13 +1,13 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ @@ -19,14 +19,11 @@ #define count x2 #define dst x3 #define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 +#define zva_val x5 ENTRY (__memset_aarch64) + PTR_ARG (0) + SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count @@ -42,7 +39,7 @@ ENTRY (__memset_aarch64) str val, [dstin] str val, [dstend, -8] ret - nop + .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] @@ -72,108 +69,49 @@ L(set96): stp q0, q0, [dstend, -32] ret - .p2align 3 - nop + .p2align 4 L(set_long): and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - add dst, dst, 16 - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(try_zva): - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif str q0, [dst, 16] stp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): add dst, dst, 64 + dc zva, dst subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] + b.hi L(zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret - .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. 
*/ +L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - b L(tail64) - END (__memset_aarch64) + diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S new file mode 100644 index 0000000..f1c7119 --- /dev/null +++ b/string/aarch64/stpcpy-mte.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S new file mode 100644 index 0000000..82dd971 --- /dev/null +++ b/string/aarch64/stpcpy-sve.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-sve.S" diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S new file mode 100644 index 0000000..4f62aa4 --- /dev/null +++ b/string/aarch64/stpcpy.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy.S" diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S new file mode 100644 index 0000000..dcb0e46 --- /dev/null +++ b/string/aarch64/strchr-mte.S @@ -0,0 +1,105 @@ +/* + * strchr - find a character in a string + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp1 x1 +#define wtmp2 w3 +#define tmp3 x3 + +#define vrepchr v0 +#define vdata v1 +#define qdata q1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vrepmask2 v5 +#define vend v6 +#define dend d6 + +/* Core algorithm. + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-1 are set if the relevant byte matched the + requested character, bits 2-3 are set if the byte is NUL (or matched), and + bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd + bytes set bits 4-7 so that adjacent bytes can be merged. 
Since the bits + in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strchr_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + ld1 {vdata.16b}, [src] + mov wtmp2, 0x3003 + dup vrepmask.8h, wtmp2 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp2, 0xf00f + dup vrepmask2.8h, wtmp2 + + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + lsl tmp3, srcin, 2 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + + fmov tmp1, dend + lsr tmp1, tmp1, tmp3 + cbz tmp1, L(loop) + + rbit tmp1, tmp1 + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ + tst tmp1, 2 + add result, srcin, tmp1, lsr 2 + csel result, result, xzr, eq + ret + + .p2align 4 +L(loop): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbz tmp1, L(loop) + +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend +#else + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend + rbit tmp1, tmp1 +#endif + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ + tst tmp1, 2 + add result, src, tmp1, lsr 2 + csel result, result, xzr, eq + ret + +END (__strchr_aarch64_mte) + diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 8d8a319..13ba9f4 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,19 +1,19 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - /* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */ #ifdef BUILD_STRCHRNUL #define FUNC __strchrnul_aarch64_sve @@ -21,10 +21,8 @@ #define FUNC __strchr_aarch64_sve #endif - .globl FUNC - .type FUNC, %function - .p2align 4 -FUNC: +ENTRY (FUNC) + PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -66,4 +64,7 @@ FUNC: incp x0, p0.b b 0b - .size FUNC, . - FUNC +END (FUNC) + +#endif + diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 00d9be3..1063cbf 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,7 +1,7 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -51,11 +51,12 @@ /* Locals and temporaries. */ ENTRY (__strchr_aarch64) - /* Magic constant 0x40100401 to allow us to identify which lane - matches the requested byte. Magic constant 0x80200802 used - similarly for NUL termination. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 + PTR_ARG (0) + /* Magic constant 0xc0300c03 to allow us to identify which lane + matches the requested byte. 
Even bits are set if the character + matches, odd bits if either the char is NUL or matches. */ + mov wtmp2, 0x0c03 + movk wtmp2, 0xc030, lsl 16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 @@ -73,12 +74,10 @@ ENTRY (__strchr_aarch64) cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b lsl tmp1, tmp1, #1 addp vend1.16b, vend1.16b, vend2.16b // 256->128 mov tmp3, #~0 @@ -89,31 +88,26 @@ ENTRY (__strchr_aarch64) bic tmp1, tmp3, tmp1 // Mask padding bits. cbnz tmp1, L(tail) + .p2align 4 L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vend1.16b, vend2.16b - addp vend1.2d, vend1.2d, vend1.2d + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b mov tmp1, vend1.d[0] cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b addp vend1.16b, vend1.16b, vend2.16b // 256->128 addp vend1.16b, vend1.16b, vend2.16b // 128->64 - mov tmp1, vend1.d[0] L(tail): /* Count the trailing zeros, by bit reversing... */ @@ -129,3 +123,4 @@ L(tail): ret END (__strchr_aarch64) + diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S new file mode 100644 index 0000000..1b0d0a6 --- /dev/null +++ b/string/aarch64/strchrnul-mte.S @@ -0,0 +1,84 @@ +/* + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp1 x1 +#define tmp2 x3 +#define tmp2w w3 + +#define vrepchr v0 +#define vdata v1 +#define qdata q1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vend v5 +#define dend d5 + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. 
Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strchrnul_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + ld1 {vdata.16b}, [src] + mov tmp2w, 0xf00f + dup vrepmask.8h, tmp2w + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + lsl tmp2, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov tmp1, dend + lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ + cbz tmp1, L(loop) + + rbit tmp1, tmp1 + clz tmp1, tmp1 + add result, srcin, tmp1, lsr 2 + ret + + .p2align 4 +L(loop): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbz tmp1, L(loop) + + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov tmp1, dend +#ifndef __AARCH64EB__ + rbit tmp1, tmp1 +#endif + clz tmp1, tmp1 + add result, src, tmp1, lsr 2 + ret + +END (__strchrnul_aarch64_mte) + diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 5140e59..428ff1a 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -1,7 +1,7 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 81264ea..a4230d9 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,7 +1,7 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -47,6 +47,7 @@ /* Locals and temporaries. */ ENTRY (__strchrnul_aarch64) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 @@ -63,14 +64,12 @@ ENTRY (__strchrnul_aarch64) syndrome that are related to the padding. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b lsl tmp1, tmp1, #1 addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 mov tmp3, #~0 @@ -81,24 +80,22 @@ ENTRY (__strchrnul_aarch64) bic tmp1, tmp3, tmp1 // Mask padding bits. cbnz tmp1, L(tail) + .p2align 4 L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. 
*/ - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b - addp vend1.2d, vend1.2d, vend1.2d + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b mov tmp1, vend1.d[0] cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vend1.16b, vend1.16b // 128->64 @@ -114,3 +111,4 @@ L(tail): ret END (__strchrnul_aarch64) + diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S new file mode 100644 index 0000000..12d1a6b --- /dev/null +++ b/string/aarch64/strcmp-mte.S @@ -0,0 +1,189 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (__strcmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. 
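The (X - 1) & (~X) & 0x80 principle stated in the preamble above is easy to check in C. A minimal demonstration, assuming a little-endian host for the memcpy view (the big-endian caveat is dealt with separately below):

#include <stdint.h>
#include <string.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

/* Non-zero iff some byte of x is zero; matches the sub/orr/bics
   sequence: (x - REP8_01) & ~(x | REP8_7f). */
static uint64_t has_nul(uint64_t x)
{
    return (x - REP8_01) & ~(x | REP8_7f);
}

int main(void)
{
    uint64_t x;
    memcpy(&x, "abc\0defg", 8);              /* byte 3 is the NUL */
    /* The lowest set bit sits in the high bit of the first zero byte. */
    return __builtin_ctzll(has_nul(x)) / 8 == 3 ? 0 : 1;
}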
Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64_mte) + diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index 91bac19..e6d2da5 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,29 +1,28 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __strcmp_aarch64_sve - .type __strcmp_aarch64_sve, %function - .p2align 4 -__strcmp_aarch64_sve: +ENTRY (__strcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) setffr /* initialize FFR */ ptrue p1.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ - nop /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 0: ldff1b z0.b, p1/z, [x0, x2] ldff1b z1.b, p1/z, [x1, x2] rdffrs p0.b, p1/z @@ -54,4 +53,7 @@ __strcmp_aarch64_sve: b.none 0b b 1b - .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve +END (__strcmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 65af5ce..7714ebf 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,7 +1,7 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012, Arm Limited. + * Copyright (c) 2012-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -37,6 +37,8 @@ /* Start of performance-critical section -- one 64B cache line. 
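The L(mutual_align) prologue above rounds both sources down and then poisons the bytes that precede the true start. A little-endian C model of the masking, where off is the shared misalignment in bytes (1..7): forcing those bytes to 0xff makes them compare equal in both words and guarantees none of them can look like a NUL.

#include <stdint.h>

/* Model of: mov tmp, -1 ; lsr tmp, tmp, shift ; orr data, data, tmp
   with shift = 64 - 8*off (LS_FW is lsr on little-endian). */
static uint64_t mask_before_start(uint64_t data, unsigned off /* 1..7 */)
{
    uint64_t ones = ~0ull >> (64 - 8 * off);   /* low 'off' bytes all-ones */
    return data | ones;
}

The assembly can feed the negated offset straight into the shift because AArch64 variable shifts use only the low six bits of the register.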
*/ ENTRY (__strcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -168,3 +170,4 @@ L(done): ret END (__strcmp_aarch64) + diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S new file mode 100644 index 0000000..88c222d --- /dev/null +++ b/string/aarch64/strcpy-mte.S @@ -0,0 +1,161 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define synd x4 +#define tmp x5 +#define wtmp w5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 +#define dataq2 q1 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64_mte +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64_mte +# define IFSTPCPY(X,...) +#endif + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4,,8 +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 + + .p2align 4 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] + str data1, [dstin] + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(start_loop): + sub len, src, srcin + ldr dataq2, [srcin] + add dst, dstin, len + str dataq2, [dstin] + + .p2align 5 +L(loop): + str dataq, [dst], 16 + ldr dataq, [src, 16]! 
+ cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz len, synd + lsr len, len, 2 + sub tmp, len, 15 + ldr dataq, [src, tmp] + str dataq, [dst, tmp] + IFSTPCPY (add result, dst, len) + ret + +END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index c929f37..f515462 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,19 +1,19 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - /* To build as stpcpy, define BUILD_STPCPY before compiling this file. */ #ifdef BUILD_STPCPY #define FUNC __stpcpy_aarch64_sve @@ -21,10 +21,9 @@ #define FUNC __strcpy_aarch64_sve #endif - .globl FUNC - .type FUNC, %function - .p2align 4 -FUNC: +ENTRY (FUNC) + PTR_ARG (0) + PTR_ARG (1) setffr /* initialize FFR */ ptrue p2.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ @@ -66,4 +65,7 @@ FUNC: #endif ret - .size FUNC, . - FUNC +END (FUNC) + +#endif + diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 4edffcf..6e9ed42 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,7 +1,7 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2019, Arm Limited. + * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -80,6 +80,8 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. This avoids the need for @@ -306,3 +308,4 @@ L(page_cross): b L(fp_gt8) END (STRCPY) + diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S new file mode 100644 index 0000000..7cf41d5 --- /dev/null +++ b/string/aarch64/strlen-mte.S @@ -0,0 +1,80 @@ +/* + * strlen - calculate the length of a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define result x0 + +#define src x1 +#define synd x2 +#define tmp x3 +#define wtmp w3 +#define shift x4 + +#define data q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. 
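Stepping back to the strcpy/stpcpy tail handling above (L(less16), L(less8), L(less4)): short strings are copied with two fixed-width, possibly overlapping accesses, one anchored at the start and one ending exactly at the terminator, instead of a byte loop. A C sketch of the same idea for len < 16 (the helper name and the memcpy framing are mine; the assembly uses 8-, 4- and 2-byte register accesses):

#include <string.h>
#include <stddef.h>

/* Copy a string of known length len (< 16, excluding the NUL) with at
   most two overlapping fixed-width copies covering len+1 bytes. */
static void copy_short(char *dst, const char *src, size_t len)
{
    if (len >= 8) {            /* two 8-byte copies: [0..7] and [len-7..len] */
        memcpy(dst, src, 8);
        memcpy(dst + len - 7, src + len - 7, 8);
    } else if (len >= 3) {     /* two 4-byte copies: [0..3] and [len-3..len] */
        memcpy(dst, src, 4);
        memcpy(dst + len - 3, src + len - 3, 4);
    } else {                   /* 0..2 bytes plus the terminator */
        if (len > 0)
            memcpy(dst, src, 2);
        dst[len] = '\0';
    }
}

The two ranges overlap whenever len+1 is not a multiple of the access width; since both are plain loads followed by stores of the same data, the overlap is harmless.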
*/ + +ENTRY (__strlen_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz result, synd + lsr result, result, 2 + ret + + .p2align 5 +L(loop): + ldr data, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub result, src, srcin + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add result, result, tmp, lsr 2 + ret + +END (__strlen_aarch64_mte) + diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 64ede85..2392493 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,31 +1,28 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __strlen_aarch64_sve - .type __strlen_aarch64_sve, %function - .p2align 4 -__strlen_aarch64_sve: +ENTRY (__strlen_aarch64_sve) + PTR_ARG (0) setffr /* initialize FFR */ ptrue p2.b /* all ones; loop invariant */ mov x1, 0 /* initialize length */ - nop /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 0: ldff1b z0.b, p2/z, [x0, x1] - nop rdffrs p0.b, p2/z b.nlast 2f @@ -52,4 +49,7 @@ __strlen_aarch64_sve: incp x1, p0.b b 0b - .size __strlen_aarch64_sve, . - __strlen_aarch64_sve +END (__strlen_aarch64_sve) + +#endif + diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 2293f73..a1b164a 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,84 +1,88 @@ /* - * strlen - calculate the length of a string + * strlen - calculate the length of a string. * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2020, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * Not MTE compatible. */ #include "../asmdefs.h" -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ -#define srcin x0 -#define len x0 - -/* Locals and temporaries. */ -#define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. 
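The first-iteration trick in __strlen_aarch64_mte above, align down to a 16-byte granule, build the syndrome for the whole granule, then shift out the bits belonging to bytes before the string, can be sketched in C. The model reads whole granules, which is only safe in the real code because aligned 16-byte loads cannot cross an MTE tag granule or a page:

#include <stdint.h>
#include <stddef.h>

/* Scalar model: one nibble of syndrome per byte, pre-string bytes
   dropped with a single shift (shift = (s & 15) * 4). */
static size_t strlen_model(const char *s)
{
    const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
    uint64_t synd = 0;
    for (int i = 0; i < 16; i++)
        if (p[i] == 0)
            synd |= 0xfull << (4 * i);
    synd >>= ((uintptr_t) s & 15) * 4;       /* lsr synd, synd, shift */
    if (synd)
        return (size_t) (__builtin_ctzll(synd) >> 2);
    for (p += 16;; p += 16)                  /* L(loop): aligned granules */
        for (int i = 0; i < 16; i++)
            if (p[i] == 0)
                return (size_t) (p + i - s);
}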
*/ +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ #ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 15 +# define MIN_PAGE_SIZE 32 #else # define MIN_PAGE_SIZE 4096 #endif - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. In order to do an unaligned ldp - safely we have to do a page cross check first. If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - __strlen_aarch64 will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. */ ENTRY (__strlen_aarch64) + PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. 
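The entry check above is the entire page-cross test: an unaligned 32-byte first read is safe unless the start offset within a minimum-size page exceeds MIN_PAGE_SIZE - 32, so only 31 of every 4096 start addresses take the slow path. As a predicate:

#include <stdint.h>
#include <stdbool.h>

#define MIN_PAGE_SIZE 4096

/* Mirrors: and tmp1, srcin, MIN_PAGE_SIZE - 1
            cmp tmp1, MIN_PAGE_SIZE - 32
            b.hi L(page_cross)              (b.hi is an unsigned >) */
static bool page_cross(const char *s)
{
    return ((uintptr_t) s & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 32;
}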
*/ ldp data1, data2, [srcin] + mov zeroones, REP8_01 + #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul1/2 directly. @@ -94,113 +98,103 @@ ENTRY (__strlen_aarch64) bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) + b.eq L(bytes16_31) - /* Enter with C = has_nul1 == 0. */ + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 8 rev has_nul1, has_nul1 - clz tmp1, has_nul1 csel len, xzr, len, cc + clz tmp1, has_nul1 add len, len, tmp1, lsr 3 ret - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -L(page_cross_entry): - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] + .p2align 3 + /* Look for a NUL byte at offset 16..31 in the string. */ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. */ orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) + b.eq L(loop_entry) - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc -#endif - sub len, src, srcin + mov len, 24 rev has_nul1, has_nul1 - add tmp2, len, 8 + mov tmp3, 16 clz tmp1, has_nul1 - csel len, len, tmp2, cc + csel len, tmp3, len, cc add len, len, tmp1, lsr 3 ret -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) +L(loop_entry): + bic src, srcin, 31 - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. */ -L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! 
+ uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + tst synd, 0xffffffff + b.ne 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. */ #ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0xf0 #else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0x0f, lsl 8 +#endif + umaxp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) + clz tmp, synd + add len, len, tmp, lsr 2 + ret + + .p2align 4 + +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret END (__strlen_aarch64) diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S new file mode 100644 index 0000000..c9d6fc8 --- /dev/null +++ b/string/aarch64/strncmp-mte.S @@ -0,0 +1,307 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +ENTRY (__strncmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. 
*/ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. 
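The big-endian restriction described at L(not_limit) above is concrete: the borrow in x - REP8_01 ripples toward more-significant bytes, so a 0x01 byte sitting just above a real NUL in the register is falsely flagged. On little-endian the spurious flag lands at a later string position and loses to the trailing-zero count; on big-endian it would win, hence the rev-and-recompute. A small demonstration (register image built on a little-endian host):

#include <stdint.h>
#include <string.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

int main(void)
{
    /* Byte 1 is a real NUL; byte 2 holds 0x01. */
    uint8_t bytes[8] = { 0x41, 0x00, 0x01, 0x41, 0x41, 0x41, 0x41, 0x41 };
    uint64_t x, z;
    memcpy(&x, bytes, 8);
    z = (x - REP8_01) & ~(x | REP8_7f);
    /* z flags byte 1 (bit 15) as expected -- and byte 2 (bit 23) too. */
    return (int) ((z >> 23) & 1);            /* 1: spurious flag present */
}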
*/ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. 
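The STEP_A recombination in the diagram above reduces to a classic two-shift merge: one aligned dword supplies the tail of the 8 bytes SRC2 owes us, the next supplies the head. In little-endian C, with bit_off = 8 * (src2 & 7), which the earlier byte loop guarantees is non-zero on this path:

#include <stdint.h>

/* Model of: LS_FW data2, tmp1, offset ; LS_BK tmp1, tmp2, neg_offset ;
   orr data2, data2, tmp1  (LS_FW = lsr, LS_BK = lsl on little-endian). */
static uint64_t combine(uint64_t lo, uint64_t hi, unsigned bit_off /* 8..56 */)
{
    return (lo >> bit_off) | (hi << (64 - bit_off));
}

As the source comment notes, there is no need to mask the offset register: variable shifts ignore all but the low six bits, so the negated offset behaves as 64 - bit_off.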
*/ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +END(__strncmp_aarch64_mte) + diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 6f31eca..234190e 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,23 +1,23 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __strncmp_aarch64_sve - .type __strncmp_aarch64_sve, %function - .p2align 4 -__strncmp_aarch64_sve: +ENTRY (__strncmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) setffr /* initialize FFR */ mov x3, 0 /* initialize off */ @@ -63,4 +63,7 @@ __strncmp_aarch64_sve: 9: mov x0, 0 /* return equal */ ret - .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve +END (__strncmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index fbd08ee..738b653 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,7 +1,7 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -40,12 +40,10 @@ #define endloop x15 #define count mask - .text - .p2align 6 - .rep 7 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64, 0) +ENTRY (__strncmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -60,7 +58,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -73,7 +71,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ /* Not reached the limit, must have found the end or a diff. */ tbz limit_wd, #63, L(not_limit) @@ -178,7 +176,7 @@ L(mutual_align): add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 @@ -259,3 +257,4 @@ L(ret0): ret END ( __strncmp_aarch64) + diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 3a9be08..5b9ebf7 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,23 +1,22 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. 
*/ - .arch armv8-a+sve - .text - - .globl __strnlen_aarch64_sve - .type __strnlen_aarch64_sve, %function - .p2align 4 -__strnlen_aarch64_sve: +ENTRY (__strnlen_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (1) setffr /* initialize FFR */ mov x2, 0 /* initialize len */ b 1f @@ -66,7 +65,10 @@ __strnlen_aarch64_sve: b 1b /* End of count. Return max. */ -9: mov x0, x2 +9: mov x0, x1 ret - .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve +END (__strnlen_aarch64_sve) + +#endif + diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index df66b60..48d2495 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,155 +1,112 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2020, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #include "../asmdefs.h" -/* Arguments and results. */ #define srcin x0 -#define len x0 -#define limit x1 +#define cntin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 +#define synd x3 +#define shift x4 +#define wtmp w4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -L(start): - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 7 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. */ -L(hit_limit): - mov len, limit +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strnlen_aarch64) + PTR_ARG (0) + SIZE_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls ret -ENTRY_ALIGN (__strnlen_aarch64, 0) - cbz limit, L(hit_limit) - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne L(misaligned) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. 
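The one-line SVE fix above (mov x0, x2 becoming mov x0, x1) matters because the running offset in x2 advances a whole vector at a time and can overshoot the limit; when the scan ends without finding a NUL, strnlen must return exactly the limit, which lives in x1. The contract, as plain C:

#include <stddef.h>

/* Reference semantics: never report more than maxlen, even though a
   vectorised scan may have looked past it. */
static size_t strnlen_ref(const char *s, size_t maxlen)
{
    size_t i = 0;
    while (i < maxlen && s[i] != '\0')
        i++;
    return i;    /* == maxlen when no NUL occurs in range */
}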
This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. */ -L(loop): - ldp data1, data2, [src], #16 -L(realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq L(loop) - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, L(hit_limit) /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). */ - - sub len, src, srcin - cbz has_nul1, L(nul_in_data2) -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -L(nul_in_data2): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 +L(start_loop): + sub tmp, src, srcin + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 5 +L(loop32): + ldr qdata, [src], 16 + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src], 16 + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) + +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] + sub result, src, srcin +#ifndef __AARCH64EB__ + rbit synd, synd #endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls ret -L(misaligned): - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. */ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). 
*/ -#endif - add limit_wd, limit_wd, tmp3, lsr #4 - - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b L(realigned) +L(nomatch): + mov result, cntin + ret END (__strnlen_aarch64) + diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S new file mode 100644 index 0000000..1e4fb1a --- /dev/null +++ b/string/aarch64/strrchr-mte.S @@ -0,0 +1,127 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp x3 +#define wtmp w3 +#define synd x3 +#define shift x4 +#define src_match x4 +#define nul_match x5 +#define chr_match x6 + +#define vrepchr v0 +#define vdata v1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vrepmask2 v5 +#define vend v5 +#define dend d5 + +/* Core algorithm. + + For each 16-byte chunk we calculate a 64-bit syndrome value, with + four bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bits 0-1 are set if + the relevant byte matched the requested character; bits 2-3 are set + if the relevant byte matched the NUL end of string. */ + +ENTRY (__strrchr_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + mov wtmp, 0x3003 + dup vrepmask.8h, wtmp + tst srcin, 15 + beq L(loop1) + + ld1 {vdata.16b}, [src], 16 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp, 0xf00f + dup vrepmask2.8h, wtmp + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + lsl shift, srcin, 2 + fmov synd, dend + lsr synd, synd, shift + lsl synd, synd, shift + ands nul_match, synd, 0xcccccccccccccccc + bne L(tail) + cbnz synd, L(loop2) + + .p2align 5 +L(loop1): + ld1 {vdata.16b}, [src], 16 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop1) + + cmeq vhas_nul.16b, vdata.16b, 0 + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + ands nul_match, synd, 0xcccccccccccccccc + beq L(loop2) + +L(tail): + sub nul_match, nul_match, 1 + and chr_match, synd, 0x3333333333333333 + ands chr_match, chr_match, nul_match + sub result, src, 1 + clz tmp, chr_match + sub result, result, tmp, lsr 2 + csel result, result, xzr, ne + ret + + .p2align 4 +L(loop2): + cmp synd, 0 + csel src_match, src, src_match, ne + csel chr_match, synd, chr_match, ne + ld1 {vdata.16b}, [src], 16 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + tst synd, 0xcccccccccccccccc + beq L(loop2) + + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + and nul_match, synd, 0xcccccccccccccccc + sub nul_match, nul_match, 1 + and tmp, synd, 0x3333333333333333 + ands tmp, tmp, nul_match + csel chr_match, tmp, chr_match, ne + csel src_match, src, src_match, ne + sub src_match, src_match, 1 + clz tmp, chr_match + sub result, src_match, tmp, lsr 2 + ret + +END (__strrchr_aarch64_mte) + diff --git 
a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index bb522e7..d36d69a 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,23 +1,21 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ - .arch armv8-a+sve - .text - - .globl __strrchr_aarch64_sve - .type __strrchr_aarch64_sve, %function - .p2align 4 -__strrchr_aarch64_sve: +ENTRY (__strrchr_aarch64_sve) + PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -80,4 +78,7 @@ __strrchr_aarch64_sve: 5: mov x0, 0 ret - .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve +END (__strrchr_aarch64_sve) + +#endif + diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index 1b4caac..56185ff 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -55,6 +55,7 @@ identify exactly which byte is causing the termination, and why. */ ENTRY (__strrchr_aarch64) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ @@ -84,38 +85,38 @@ ENTRY (__strrchr_aarch64) and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 - mov nul_match, vhas_nul1.d[0] + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] lsl tmp1, tmp1, #1 mov const_m1, #~0 - mov chr_match, vhas_chr1.d[0] lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] bic nul_match, nul_match, tmp3 // Mask padding bits. bic chr_match, chr_match, tmp3 // Mask padding bits. cbnz nul_match, L(tail) + .p2align 4 L(loop): cmp chr_match, #0 csel src_match, src, src_match, ne csel src_offset, chr_match, src_offset, ne L(aligned): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + uminp vend1.16b, vdata1.16b, vdata2.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vend1.16b, vend1.16b, vend1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 mov nul_match, vend1.d[0] - mov chr_match, vhas_chr1.d[0] + mov chr_match, vend1.d[1] cbz nul_match, L(loop) + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b @@ -145,3 +146,4 @@ L(tail): ret END (__strrchr_aarch64) + diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S new file mode 100644 index 0000000..1cff934 --- /dev/null +++ b/string/arm/check-arch.S @@ -0,0 +1,10 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. 
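The reworked __strrchr_aarch64 loop above uses the same trick as the new strlen: a pairwise unsigned minimum. Since min(a, b) == 0 for unsigned bytes exactly when a == 0 or b == 0, one uminp halves 32 bytes to 16 before the single compare with zero. A per-byte model (the lane ordering differs from uminp's, but that is irrelevant to an any-zero test):

#include <stdint.h>
#include <stdbool.h>

static bool any_nul_32(const uint8_t *p)
{
    uint8_t m[16];
    for (int i = 0; i < 16; i++) {
        uint8_t a = p[2 * i], b = p[2 * i + 1];
        m[i] = a < b ? a : b;                /* uminp vend1, vdata1, vdata2 */
    }
    for (int i = 0; i < 16; i++)
        if (m[i] == 0)                       /* cmeq vend1, vend1, 0 */
            return true;
    return false;
}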
+ * SPDX-License-Identifier: MIT + */ + +#if !__arm__ +# error ARCH setting does not match the compiler. +#endif diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 2eff4d1..3f1ac4d 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -1,7 +1,7 @@ /* * memchr - scan memory for a character * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -31,7 +31,6 @@ #else #define CHARTSTMASK(c) 1<<(c*8) #endif - .text .thumb @ --------------------------------------------------------------------------- diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index aab78a2..86e6493 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -1,7 +1,7 @@ /* * memcpy - copy memory area * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -124,7 +124,7 @@ ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 - bge L(cpy_not_short) + bhs L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ @@ -239,10 +239,10 @@ L(cpy_not_short): 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt L(tail63aligned) + blo L(tail63aligned) cmp tmp2, #512 - bge L(cpy_body_long) + bhs L(cpy_body_long) L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP @@ -266,7 +266,7 @@ L(cpy_body_medium): /* Count in tmp2. */ add src, src, #64 vstr d1, [dst, #56] add dst, dst, #64 - bge 1b + bhs 1b tst tmp2, #0x3f beq L(done) @@ -312,7 +312,7 @@ L(tail63aligned): /* Count in tmp2. */ ldrd A_l, A_h, [src, #64]! strd A_l, A_h, [dst, #64]! subs tmp2, tmp2, #64 - bge 1b + bhs 1b tst tmp2, #0x3f bne 1f ldr tmp2,[sp], #FRAME_SIZE @@ -383,7 +383,7 @@ L(cpy_body_long): /* Count in tmp2. */ add src, src, #32 subs tmp2, tmp2, #prefetch_lines * 64 * 2 - blt 2f + blo 2f 1: cpy_line_vfp d3, 0 cpy_line_vfp d4, 64 @@ -395,7 +395,7 @@ L(cpy_body_long): /* Count in tmp2. */ add dst, dst, #2 * 64 add src, src, #2 * 64 subs tmp2, tmp2, #prefetch_lines * 64 - bge 1b + bhs 1b 2: cpy_tail_vfp d3, 0 @@ -499,15 +499,15 @@ L(cpy_notaligned): 1: pld [src, #(3 * 64)] subs count, count, #64 - ldrmi tmp2, [sp], #FRAME_SIZE - bmi L(tail63unaligned) + ldrlo tmp2, [sp], #FRAME_SIZE + blo L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON vld1.8 {d0-d3}, [src]! vld1.8 {d4-d7}, [src]! subs count, count, #64 - bmi 2f + blo 2f 1: pld [src, #(4 * 64)] vst1.8 {d0-d3}, [ALIGN (dst, 64)]! @@ -515,7 +515,7 @@ L(cpy_notaligned): vst1.8 {d4-d7}, [ALIGN (dst, 64)]! vld1.8 {d4-d7}, [src]! subs count, count, #64 - bpl 1b + bhs 1b 2: vst1.8 {d0-d3}, [ALIGN (dst, 64)]! vst1.8 {d4-d7}, [ALIGN (dst, 64)]! diff --git a/string/arm/memset.S b/string/arm/memset.S index 3ee5238..11e9273 100644 --- a/string/arm/memset.S +++ b/string/arm/memset.S @@ -1,7 +1,7 @@ /* * memset - fill memory with a constant * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -25,7 +25,6 @@ #else #define CHARTSTMASK(c) 1<<(c*8) #endif - .text .thumb @ --------------------------------------------------------------------------- diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index d615231..b75d414 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -1,10 +1,12 @@ /* * strcmp for ARMv6-M (optimized for performance, not size) * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. 
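The condition-code changes in the Arm memcpy above (bge to bhs, blt and bmi to blo, bpl to bhs) all make the same correction: copy counts are unsigned, and the signed conditions misbehave once the count has its top bit set. A hypothetical 32-bit illustration, assuming the usual two's-complement view:

#include <stdint.h>

/* After "cmp count, #64": bge tests the signed view, bhs the unsigned. */
static int conditions_agree(uint32_t count)
{
    int signed_ge   = ((int32_t) count >= 64);   /* bge */
    int unsigned_hs = (count >= 64u);            /* bhs */
    return signed_ge == unsigned_hs;             /* 0 once count >= 0x80000000 */
}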
* SPDX-License-Identifier: MIT */ +#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 + .thumb_func .syntax unified .arch armv6-m @@ -111,3 +113,5 @@ ENTRY_ALIGN (__strcmp_armv6m, 4) pop {r4, r5, r6, pc} END (__strcmp_armv6m) + +#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */ diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index 295db8b..51443e3 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -1,10 +1,12 @@ /* * strcmp for ARMv7 * - * Copyright (c) 2012-2019, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * SPDX-License-Identifier: MIT */ +#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 + /* Implementation of strcmp for ARMv7 when DSP instructions are available. Use ldrd to support wider loads, provided the data is sufficiently aligned. Use saturating arithmetic to optimize @@ -123,7 +125,6 @@ #endif .endm - .text .p2align 5 L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 @@ -470,3 +471,5 @@ L(strcmp_tail): bx lr END (__strcmp_arm) + +#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */ diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c index 48ebbe8..02cf94f 100644 --- a/string/arm/strcpy.c +++ b/string/arm/strcpy.c @@ -1,10 +1,12 @@ /* * strcpy * - * Copyright (c) 2008-2019, Arm Limited. + * Copyright (c) 2008-2020, Arm Limited. * SPDX-License-Identifier: MIT */ +#if defined (__thumb2__) && !defined (__thumb__) + /* For GLIBC: #include <string.h> #include <memcopy.h> @@ -127,3 +129,5 @@ __strcpy_arm (char* dst, const char* src) "BX LR"); } /* For GLIBC: libc_hidden_builtin_def (strcpy) */ + +#endif /* defined (__thumb2__) && !defined (__thumb__) */ diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 76e6930..5ad30c9 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -1,10 +1,12 @@ /* * strlen - calculate the length of a string * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2020, Arm Limited. * SPDX-License-Identifier: MIT */ +#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + /* Assumes: ARMv6T2, AArch32 @@ -118,3 +120,5 @@ L(misaligned8): b L(start_realigned) END (__strlen_armv6t2) + +#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */ diff --git a/string/asmdefs.h b/string/asmdefs.h index 7d143a9..340b427 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -1,13 +1,64 @@ /* * Macros for asm code. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ #ifndef _ASMDEFS_H #define _ASMDEFS_H +#if defined(__aarch64__) + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. 
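For reference, the .word sequence emitted by GNU_PROPERTY encodes an ELF NT_GNU_PROPERTY_TYPE_0 note; laid out as a C struct (field names are illustrative, not taken from elf.h) the emitted bytes are:

#include <stdint.h>

struct gnu_property_note
{
  uint32_t namesz;     /* 4: strlen ("GNU") + 1             */
  uint32_t descsz;     /* 16: one property descriptor       */
  uint32_t type;       /* 5: NT_GNU_PROPERTY_TYPE_0         */
  char name[4];        /* "GNU"                             */
  uint32_t pr_type;    /* e.g. FEATURE_1_AND (0xc0000000)   */
  uint32_t pr_datasz;  /* 4                                 */
  uint32_t pr_data;    /* FEATURE_1_BTI | FEATURE_1_PAC     */
  uint32_t pr_pad;     /* zero padding to 8-byte alignment  */
};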
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#else + +#define END_FILE + #define ENTRY_ALIGN(name, alignment) \ .global name; \ .type name,%function; \ @@ -15,6 +66,8 @@ name: \ .cfi_startproc; +#endif + #define ENTRY(name) ENTRY_ALIGN(name, 6) #define ENTRY_ALIAS(name) \ @@ -28,4 +81,18 @@ #define L(l) .L ## l +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + #endif diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c new file mode 100644 index 0000000..d5d4ea7 --- /dev/null +++ b/string/bench/memcpy.c @@ -0,0 +1,260 @@ +/* + * memcpy benchmark. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 500000 +#define MAX_COPIES 8192 +#define SIZE (256*1024) + +static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, const void *, size_t); +} funtab[] = +{ + F(memcpy) +#if __aarch64__ + F(__memcpy_aarch64) +# if __ARM_NEON + F(__memcpy_aarch64_simd) +# endif +#elif __arm__ + F(__memcpy_arm) +#endif +#undef F + {0, 0} +}; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t size_arr[SIZE_NUM]; + +/* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. 
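Each {size, freq} pair below is expanded into freq copies of size in a 65536-entry table, so indexing the table with rand32 (0) & SIZE_MASK samples copy sizes with roughly the measured SPEC2017 probabilities (inverse-transform sampling). The expansion step in isolation, as a sketch using 16-bit slots so sizes up to 4095 fit without truncation:

#include <stdint.h>
#include <assert.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

/* Expand {value, frequency} pairs into a flat table; the frequencies
   must sum exactly to tabsize so a masked random index is uniform.  */
static void
expand_freq (const freq_data_t *f, uint16_t *tab, int tabsize)
{
  int n = 0;
  for (int i = 0; f[i].freq != 0; i++)
    for (int j = 0; j < f[i].freq; j++)
      tab[n++] = f[i].size;
  assert (n == tabsize);
}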
*/ +static freq_data_t size_freq[] = +{ +{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035}, +{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721}, +{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460}, +{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303}, +{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185}, +{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96}, +{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68}, +{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47}, +{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35}, +{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22}, +{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15}, +{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11}, +{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9}, +{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8}, +{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6}, +{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3}, +{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2}, +{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2}, +{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1}, +{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1}, +{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1}, +{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1}, +{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1}, +{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1}, +{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1}, +{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1}, +{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1}, +{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1}, +{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1}, +{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1}, +{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t src_align_arr[ALIGN_NUM]; +static uint8_t dst_align_arr[ALIGN_NUM]; + +/* Source alignment frequency for memcpy based on SPEC2017. */ +static align_data_t src_align_freq[] = +{ + {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0} +}; + +static align_data_t dst_align_freq[] = +{ + {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0} +}; + +typedef struct +{ + uint64_t src : 24; + uint64_t dst : 24; + uint64_t len : 16; +} copy_t; + +static copy_t copy[MAX_COPIES]; + +typedef char *(*proto_t) (char *, const char *, size_t); + +static void +init_copy_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = size_freq[i].freq) != 0; i++) + for (j = 0, size = size_freq[i].size; j < freq; j++) + size_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++) + for (j = 0, size = src_align_freq[i].align; j < freq; j++) + src_align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); + + for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++) + for (j = 0, size = dst_align_freq[i].align; j < freq; j++) + dst_align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_copies (size_t max_size) +{ + size_t total = 0; + /* Create a random set of copies with the given size and alignment + distributions. 
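The alignment tables store align - 1, so the loop below applies a sampled alignment with a single AND: clearing the low bits rounds a random offset down to a multiple of the (power-of-two) alignment. The same operation as a stand-alone sketch:

#include <stdint.h>

/* Round off down to a multiple of align, which must be a power of 2;
   e.g. align_down (0x12345, 16) == 0x12340.  */
static uint32_t
align_down (uint32_t off, uint32_t align)
{
  return off & ~(align - 1);
}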
*/ + for (int i = 0; i < MAX_COPIES; i++) + { + copy[i].dst = (rand32 (0) & (max_size - 1)); + copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].src = (rand32 (0) & (max_size - 1)); + copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += copy[i].len; + } + + return total; +} + +int main (void) +{ + init_copy_distribution (); + + memset (a, 1, sizeof (a)); + memset (b, 2, sizeof (b)); + + printf("Random memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s (B/ns) ", funtab[f].name); + rand32 (0x12345678); + + for (int size = 16384; size <= SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + } + + printf ("\nMedium memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 16; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b, a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nLarge memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (b, a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("\nUnaligned forwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, a + 256 + (i & 31), size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + + printf ("\nUnaligned backwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a + 256 + (i & 31), a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + return 0; +} diff --git a/string/bench/strlen.c b/string/bench/strlen.c new file mode 100644 index 0000000..cc0f04b --- /dev/null +++ b/string/bench/strlen.c @@ -0,0 +1,221 @@ +/* + * strlen benchmark. + * + * Copyright (c) 2020, Arm Limited. 
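As in the memcpy benchmark above, results are reported in bytes per nanosecond: one untimed pass warms the caches and branch predictors, then the routine is timed over many iterations of a monotonic clock. Reduced to its essentials (a stand-alone sketch, not part of the sources; the function-pointer call keeps the compiler from folding the loop, much as the funtab indirection does):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static uint64_t
now_ns (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
}

int
main (void)
{
  static char a[4096], b[4096];
  enum { ITERS = 100000 };
  void *(*volatile fp) (void *, const void *, size_t) = memcpy;

  fp (b, a, sizeof a);               /* warm-up pass, untimed */
  uint64_t t = now_ns ();
  for (int i = 0; i < ITERS; i++)
    fp (b, a, sizeof a);
  t = now_ns () - t;
  printf ("%.2f B/ns\n", (double) sizeof a * ITERS / t);
  return 0;
}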
+ * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 2000 +#define ITERS2 20000000 +#define ITERS3 2000000 +#define NUM_STRLEN 16384 + +#define MAX_ALIGN 32 +#define MAX_STRLEN 256 + +static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s); + int test_mte; +} funtab[] = { + // clang-format off + F(strlen, 0) +#if __aarch64__ + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strlen_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + F(__strlen_armv6t2, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +static uint16_t strlen_tests[NUM_STRLEN]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM - 1) +static uint8_t strlen_len_arr[SIZE_NUM]; + +/* Frequency data for strlen sizes up to 128 based on SPEC2017. */ +static freq_data_t strlen_len_freq[] = +{ + { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115}, + { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418}, + { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79}, + { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21}, + { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9}, + { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5}, + { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2}, + { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1}, + { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1}, + { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1}, + { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1}, + {107, 1}, { 0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM - 1) +static uint8_t strlen_align_arr[ALIGN_NUM]; + +/* Alignment data for strlen based on SPEC2017. */ +static align_data_t string_align_freq[] = +{ + {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0} +}; + +static void +init_strlen_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++) + for (j = 0, size = strlen_len_freq[i].size; j < freq; j++) + strlen_len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++) + for (j = 0, size = string_align_freq[i].align; j < freq; j++) + strlen_align_arr[n++] = size; + assert (n == ALIGN_NUM); +} + +static void +init_strlen_tests (void) +{ + uint16_t index[MAX_ALIGN]; + + memset (a, 'x', sizeof (a)); + + /* Create indices for strings at all alignments. */ + for (int i = 0; i < MAX_ALIGN; i++) + { + index[i] = i * (MAX_STRLEN + 1); + a[index[i] + MAX_STRLEN] = 0; + } + + /* Create a random set of strlen input strings using the string length + and alignment distributions. 
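The index arithmetic below relies on MAX_STRLEN + 1 = 257 being 1 modulo MAX_ALIGN = 32: slot i ends with a NUL at offset i * 257 + 256, and starting exp_len bytes before that NUL gives a string of exactly exp_len whose start offset is congruent to align modulo 32 (the buffer itself is 4096-byte aligned). A quick self-check of the offset math:

#include <assert.h>

int
main (void)
{
  enum { MAX_ALIGN = 32, MAX_STRLEN = 256 };
  for (int align = 0; align < MAX_ALIGN; align++)
    for (int exp_len = 0; exp_len <= MAX_STRLEN; exp_len++)
      {
        int slot = (align + exp_len) & (MAX_ALIGN - 1);
        int off = slot * (MAX_STRLEN + 1) + MAX_STRLEN - exp_len;
        assert (off % MAX_ALIGN == align);  /* sampled start alignment */
      }
  return 0;
}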
*/ + for (int n = 0; n < NUM_STRLEN; n++) + { + int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; + int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; + + strlen_tests[n] = + index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; + } +} + +static volatile size_t maskv = 0; + +int main (void) +{ + rand32 (0x12345678); + init_strlen_distribution (); + init_strlen_tests (); + + printf ("\nRandom strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t res = 0, strlen_size = 0, mask = maskv; + printf ("%22s ", funtab[f].name); + + for (int c = 0; c < NUM_STRLEN; c++) + strlen_size += funtab[f].fun (a + strlen_tests[c]); + strlen_size *= ITERS; + + /* Measure latency of strlen result with (res & mask). */ + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_STRLEN; c++) + res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); + t = clock_get_ns () - t; + printf ("%.2f\n", (double)strlen_size / t); + } + + printf ("\nSmall aligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1; size <= 64; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nSmall unaligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + int align = 9; + for (int size = 1; size <= 64; size *= 2) + { + memset (a + align, 'x', size); + a[align + size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a + align); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nMedium strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 128; size <= 4096; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("\n"); + + return 0; +} diff --git a/string/include/benchlib.h b/string/include/benchlib.h new file mode 100644 index 0000000..0f2ce2e --- /dev/null +++ b/string/include/benchlib.h @@ -0,0 +1,33 @@ +/* + * Benchmark support functions. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <time.h> + +/* Fast and accurate timer returning nanoseconds. */ +static inline uint64_t +clock_get_ns (void) +{ + struct timespec ts; + clock_gettime (CLOCK_MONOTONIC, &ts); + return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; +} + +/* Fast 32-bit random number generator. Passing a non-zero seed + value resets the internal state. 
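The generator below is a 64-bit linear congruential generator with Knuth's MMIX multiplier, returning the high 32 bits of the state (the high bits of an LCG are far better distributed than the low bits). Determinism is the point: reseeding replays the identical sequence, so every routine in a funtab is measured against the same inputs. A stand-alone copy demonstrating that:

#include <stdint.h>
#include <assert.h>

static uint64_t lcg_state = 0xb707be451df0bb19ULL;

static uint32_t
lcg32 (uint32_t seed)
{
  if (seed != 0)
    lcg_state = seed;           /* non-zero seed resets the state */
  uint32_t res = lcg_state >> 32;
  lcg_state = lcg_state * 6364136223846793005ULL + 1;
  return res;
}

int
main (void)
{
  lcg32 (0x12345678);
  uint32_t first = lcg32 (0);
  lcg32 (0x12345678);           /* reseed: the sequence replays */
  assert (lcg32 (0) == first);
  return 0;
}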
*/ +static inline uint32_t +rand32 (uint32_t seed) +{ + static uint64_t state = 0xb707be451df0bb19ULL; + if (seed != 0) + state = seed; + uint32_t res = state >> 32; + state = state * 6364136223846793005ULL + 1; + return res; +} + + diff --git a/string/include/stringlib.h b/string/include/stringlib.h index b3b6181..378c3cd 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,7 +1,7 @@ /* * Public API. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -17,8 +17,10 @@ void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64 (void *, const void *, size_t); void *__memset_aarch64 (void *, int, size_t); void *__memchr_aarch64 (const void *, int, size_t); +void *__memrchr_aarch64 (const void *, int, size_t); int __memcmp_aarch64 (const void *, const void *, size_t); char *__strcpy_aarch64 (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64 (char *__restrict, const char *__restrict); int __strcmp_aarch64 (const char *, const char *); char *__strchr_aarch64 (const char *, int); char *__strrchr_aarch64 (const char *, int); @@ -26,6 +28,15 @@ char *__strchrnul_aarch64 (const char *, int ); size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); +void * __memchr_aarch64_mte (const void *, int, size_t); +char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__strchr_aarch64_mte (const char *, int); +char * __strchrnul_aarch64_mte (const char *, int ); +size_t __strlen_aarch64_mte (const char *); +char *__strrchr_aarch64_mte (const char *, int); +int __strcmp_aarch64_mte (const char *, const char *); +int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); @@ -38,10 +49,15 @@ char *__strrchr_aarch64_sve (const char *, int); char *__strchrnul_aarch64_sve (const char *, int ); int __strcmp_aarch64_sve (const char *, const char *); char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict); size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if __ARM_FEATURE_MEMORY_TAGGING +void *__mtag_tag_region (void *, size_t); +void *__mtag_tag_zero_region (void *, size_t); +# endif #elif __arm__ void *__memcpy_arm (void *__restrict, const void *__restrict, size_t); void *__memset_arm (void *, int, size_t); diff --git a/string/memchr.S b/string/memchr.S deleted file mode 100644 index 0a564d8..0000000 --- a/string/memchr.S +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Selected possible memchr implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/memchr.S" -# if __ARM_FEATURE_SVE -#include "aarch64/memchr-sve.S" -# endif -#elif __arm__ -#include "arm/memchr.S" -#endif diff --git a/string/memcmp.S b/string/memcmp.S deleted file mode 100644 index 22da685..0000000 --- a/string/memcmp.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible memcpy implementations. - * - * Copyright (c) 2019, Arm Limited. 
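The umbrella sources deleted here (string/memchr.S, memcmp.S and the rest below) existed only to #include the matching per-architecture file; with the guards added above (the #if __ARM_ARCH checks in each implementation, plus arm/check-arch.S), every source now empties itself out on targets it does not support, so the build can compile them all directly. The pattern, schematically:

/* Illustrative self-guarding translation unit: it compiles to an
   empty object file wherever the guard is false.  */
#if defined (__aarch64__)
const char *impl_name (void) { return "__memcmp_aarch64"; }
#endif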
- * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/memcmp.S" -# if __ARM_FEATURE_SVE -#include "aarch64/memcmp-sve.S" -# endif -#endif diff --git a/string/memcpy.S b/string/memcpy.S deleted file mode 100644 index b52b603..0000000 --- a/string/memcpy.S +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Selected possible memcpy implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/memcpy.S" -# if __ARM_NEON -#include "aarch64/memcpy_simd.S" -# endif -#elif __arm__ -#include "arm/memcpy.S" -#endif diff --git a/string/memset.S b/string/memset.S deleted file mode 100644 index 57542ef..0000000 --- a/string/memset.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Selected possible memset implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/memset.S" -#elif __arm__ -#include "arm/memset.S" -#endif diff --git a/string/strchr.S b/string/strchr.S deleted file mode 100644 index 8cead02..0000000 --- a/string/strchr.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strchr implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strchr.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strchr-sve.S" -# endif -#endif diff --git a/string/strchrnul.S b/string/strchrnul.S deleted file mode 100644 index 3dfdeef..0000000 --- a/string/strchrnul.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strchr implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strchrnul.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strchrnul-sve.S" -# endif -#endif diff --git a/string/strcmp.S b/string/strcmp.S deleted file mode 100644 index 12530ec..0000000 --- a/string/strcmp.S +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Selected possible strcmp implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strcmp.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strcmp-sve.S" -# endif -#elif __arm__ -# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 -#include "arm/strcmp.S" -# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 -#include "arm/strcmp-armv6m.S" -# endif -#endif diff --git a/string/strcpy-c.c b/string/strcpy-c.c deleted file mode 100644 index 6bde24a..0000000 --- a/string/strcpy-c.c +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Selected possible strcpy implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __arm__ && defined (__thumb2__) && !defined (__thumb__) -#include "arm/strcpy.c" -#endif diff --git a/string/strcpy.S b/string/strcpy.S deleted file mode 100644 index a604b22..0000000 --- a/string/strcpy.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strcpy implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strcpy.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strcpy-sve.S" -# endif -#endif diff --git a/string/strlen.S b/string/strlen.S deleted file mode 100644 index d681033..0000000 --- a/string/strlen.S +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Selected possible strlen implementations. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strlen.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strlen-sve.S" -# endif -#elif __arm__ -# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 -#include "arm/strlen-armv6t2.S" -# endif -#endif diff --git a/string/strncmp.S b/string/strncmp.S deleted file mode 100644 index 26b56b7..0000000 --- a/string/strncmp.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strncmp implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strncmp.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strncmp-sve.S" -# endif -#endif diff --git a/string/strnlen.S b/string/strnlen.S deleted file mode 100644 index eebe777..0000000 --- a/string/strnlen.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strnlen implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strnlen.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strnlen-sve.S" -# endif -#endif diff --git a/string/strrchr.S b/string/strrchr.S deleted file mode 100644 index 119b1d5..0000000 --- a/string/strrchr.S +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Selected possible strrchr implementations. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "aarch64/strrchr.S" -# if __ARM_FEATURE_SVE -#include "aarch64/strrchr-sve.S" -# endif -#endif diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c new file mode 100644 index 0000000..d8c02d9 --- /dev/null +++ b/string/test/__mtag_tag_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_region test. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. */ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a'; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 'a') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c new file mode 100644 index 0000000..221c223 --- /dev/null +++ b/string/test/__mtag_tag_zero_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_zero_region test. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. */ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_zero_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i % 23; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 0) + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/memchr.c b/string/test/memchr.c index 1ebc6d6..0ff77f5 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -1,7 +1,7 @@ /* * memchr test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -10,84 +10,101 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - void *(*fun)(const void *, int c, size_t n); + const char *name; + void *(*fun) (const void *s, int c, size_t n); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(memchr) + // clang-format off + F(memchr, 0) #if __aarch64__ -F(__memchr_aarch64) + F(__memchr_aarch64, 0) + F(__memchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__memchr_aarch64_sve) + F(__memchr_aarch64_sve, 1) # endif #elif __arm__ -F(__memchr_arm) + F(__memchr_arm, 0) #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static unsigned char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int seekpos, int len) +static void +test (const struct fun *fun, int align, size_t seekpos, size_t len, + size_t maxlen) { - unsigned char *src = alignup(sbuf); - unsigned char *s = src + align; - unsigned char *f = len ? s + seekpos : 0; - int seekchar = 0x1; - int i; - void *p; + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos < maxlen ? 
s + seekpos : NULL; + int seekchar = 1; + void *p; - if (len > LEN || seekpos >= len || align >= A) - abort(); + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos > LEN || align > ALIGN) + abort (); - for (i = 0; i < seekpos; i++) - s[i] = 'a' + i%23; - s[i++] = seekchar; - for (; i < len; i++) - s[i] = 'a' + i%23; + for (int i = 0; src + i < s; i++) + src[i] = seekchar; + for (int i = 0; i <= ALIGN; i++) + s[len + i] = seekchar; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[seekpos] = seekchar; + s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar; - p = fun->fun(s, seekchar, len); + int mte_len = seekpos != -1 ? seekpos + 1 : maxlen; + s = tag_buffer (s, mte_len, fun->test_mte); + p = fun->fun (s, seekchar, maxlen); + untag_buffer (s, mte_len, fun->test_mte); + p = untag_pointer (p); - if (p != f) { - ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); - ERR("expected: %p\n", f); - abort(); - } + if (p != f) + { + ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, + seekchar, maxlen, p, f); + quote ("input", s, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - for (int n = 0; n < 100; n++) - for (int sp = 0; sp < n-1; sp++) - test(funtab+i, a, sp, n); - for (int n = 100; n < LEN; n *= 2) { - test(funtab+i, a, n-1, n); - test(funtab+i, a, n/2, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < LEN; sp++) + test (funtab + i, a, sp, n, n); + test (funtab + i, a, n, n, SIZE_MAX - a); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/memcmp.c b/string/test/memcmp.c index 114f1d7..7a7cf9c 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -1,7 +1,7 @@ /* * memcmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,88 +9,117 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - int (*fun)(const void *s1, const void *s2, size_t n); + const char *name; + int (*fun) (const void *s1, const void *s2, size_t n); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(memcmp) + // clang-format off + F(memcmp, 0) #if __aarch64__ -F(__memcmp_aarch64) + F(__memcmp_aarch64, 1) # if __ARM_FEATURE_SVE -F(__memcmp_aarch64_sve) + F(__memcmp_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static unsigned char s1buf[LEN+2*A]; -static unsigned char s2buf[LEN+2*A]; +static unsigned char *s1buf; +static unsigned char *s2buf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos) +static void +test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, + int delta) { - unsigned char *src1 = alignup(s1buf); - unsigned char *src2 = alignup(s2buf); - unsigned char *s1 = src1 + s1align; - unsigned char *s2 = src2 + s2align; - int r; + unsigned char *src1 = alignup (s1buf); + unsigned char *src2 = alignup (s2buf); + unsigned char *s1 = src1 + s1align; + unsigned char *s2 = src2 + s2align; + int r; - if (len > LEN || s1align >= A || s2align >= A) - abort(); - if (diffpos && diffpos >= len) - abort(); + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); - for (int i = 0; i < len+A; i++) - src1[i] = src2[i] = '?'; - for (int i = 0; i < len; i++) - s1[i] = s2[i] = 'a' + i%23; - if (diffpos) - s1[diffpos]++; + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; - r = fun->fun(s1, s2, len); + s1 = tag_buffer (s1, len, fun->test_mte); + s2 = tag_buffer (s2, len, fun->test_mte); + r = fun->fun (s1, s2, len); + untag_buffer (s1, len, fun->test_mte); + untag_buffer (s2, len, fun->test_mte); - if ((!diffpos && r != 0) || (diffpos && r == 0)) { - ERR("%s(align %d, align %d, %d) failed, returned %d\n", - fun->name, s1align, s2align, len, r); - ERR("src1: %.*s\n", s1align+len+1, src1); - ERR("src2: %.*s\n", s2align+len+1, src2); - } + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, + s1align, s2align, len, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } } -int main() +int +main () { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) { - test(funtab+i, d, s, n, 0); - test(funtab+i, d, s, n, n / 2); - } - for (; n < LEN; n *= 2) { - test(funtab+i, d, s, n, 0); - test(funtab+i, d, s, n, n / 2); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + s1buf = mte_mmap (LEN + 2 * A); + s2buf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0); + test (funtab + i, d, s, 1, -1, 0); + test (funtab + i, d, s, 1, 0, -1); + test (funtab + i, d, s, 1, 0, 1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, 0, -1); + test (funtab + i, d, s, n, n - 1, -1); + test (funtab + i, d, s, n, n / 2, 1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n / 2, -1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 8572452..ce0ceee 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,7 +1,7 @@ /* * memcpy test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,90 +9,112 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - void *(*fun)(void *, const void *, size_t); + const char *name; + void *(*fun) (void *, const void *, size_t); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(memcpy) + // clang-format off + F(memcpy, 0) #if __aarch64__ -F(__memcpy_aarch64) + F(__memcpy_aarch64, 1) # if __ARM_NEON -F(__memcpy_aarch64_simd) + F(__memcpy_aarch64_simd, 1) # endif #elif __arm__ -F(__memcpy_arm) + F(__memcpy_arm, 0) #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static unsigned char dbuf[LEN+2*A]; -static unsigned char sbuf[LEN+2*A]; -static unsigned char wbuf[LEN+2*A]; +static unsigned char *dbuf; +static unsigned char *sbuf; +static unsigned char wbuf[LEN + 2 * A]; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void test(const struct fun *fun, int dalign, int salign, int len) +static void +test (const struct fun *fun, int dalign, int salign, int len) { - unsigned char *src = alignup(sbuf); - unsigned char *dst = alignup(dbuf); - unsigned char *want = wbuf; - unsigned char *s = src + salign; - unsigned char *d = dst + dalign; - unsigned char *w = want + dalign; - void *p; - int i; + unsigned char *src = alignup (sbuf); + unsigned char *dst = alignup (dbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = want + dalign; + void *p; + int i; - if (len > LEN || dalign >= A || salign >= A) - abort(); - for (i = 0; i < len+A; i++) { - src[i] = '?'; - want[i] = dst[i] = '*'; - } - for (i = 0; i < len; i++) - s[i] = w[i] = 'a' + i%23; + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + i % 23; + + s = tag_buffer (s, len, fun->test_mte); + d = tag_buffer (d, len, fun->test_mte); + p = fun->fun (d, s, len); + untag_buffer (s, len, fun->test_mte); + untag_buffer (d, len, fun->test_mte); - p = fun->fun(d, s, len); - if (p != d) - ERR("%s(%p,..) returned %p\n", fun->name, d, p); - for (i = 0; i < len+A; i++) { - if (dst[i] != want[i]) { - ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); - ERR("got : %.*s\n", dalign+len+1, dst); - ERR("want: %.*s\n", dalign+len+1, want); - break; - } + if (p != d) + ERR ("%s(%p,..) 
returned %p\n", fun->name, d, p); + for (i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; } + } } -int main() +int +main () { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) - test(funtab+i, d, s, n); - for (; n < LEN; n *= 2) - test(funtab+i, d, s, n); - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + dbuf = mte_mmap (LEN + 2 * A); + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + test (funtab + i, d, s, n); + for (; n < LEN; n *= 2) + test (funtab + i, d, s, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/memmove.c b/string/test/memmove.c index 7891b14..689b68c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,7 +1,7 @@ /* * memmove test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,136 +9,156 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - void *(*fun)(void *, const void *, size_t); + const char *name; + void *(*fun) (void *, const void *, size_t); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(memmove) + // clang-format off + F(memmove, 0) #if __aarch64__ -F(__memmove_aarch64) + F(__memmove_aarch64, 1) # if __ARM_NEON -F(__memmove_aarch64_simd) + F(__memmove_aarch64_simd, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static unsigned char dbuf[LEN+2*A]; -static unsigned char sbuf[LEN+2*A]; -static unsigned char wbuf[LEN+2*A]; +static unsigned char *dbuf; +static unsigned char *sbuf; +static unsigned char wbuf[LEN + 2 * A]; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void test(const struct fun *fun, int dalign, int salign, int len) +static void +test (const struct fun *fun, int dalign, int salign, int len) { - unsigned char *src = alignup(sbuf); - unsigned char *dst = alignup(dbuf); - unsigned char *want = wbuf; - unsigned char *s = src + salign; - unsigned char *d = dst + dalign; - unsigned char *w = want + dalign; - void *p; - int i; - - if (len > LEN || dalign >= A || salign >= A) - abort(); - for (i = 0; i < len+A; i++) { - src[i] = '?'; - want[i] = dst[i] = '*'; - } - for (i = 0; i < len; i++) - s[i] = w[i] = 'a' + i%23; - - p = fun->fun(d, s, len); - if (p != d) - ERR("%s(%p,..) 
returned %p\n", fun->name, d, p); - for (i = 0; i < len+A; i++) { - if (dst[i] != want[i]) { - ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); - ERR("got : %.*s\n", dalign+len+1, dst); - ERR("want: %.*s\n", dalign+len+1, want); - break; - } + unsigned char *src = alignup (sbuf); + unsigned char *dst = alignup (dbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + i % 23; + + p = fun->fun (d, s, len); + if (p != d) + ERR ("%s(%p,..) returned %p\n", fun->name, d, p); + for (i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; } + } } -static void test_overlap(const struct fun *fun, int dalign, int salign, int len) +static void +test_overlap (const struct fun *fun, int dalign, int salign, int len) { - unsigned char *src = alignup(sbuf); - unsigned char *dst = alignup(sbuf); - unsigned char *want = wbuf; - unsigned char *s = src + salign; - unsigned char *d = dst + dalign; - unsigned char *w = wbuf + dalign; - void *p; - - if (len > LEN || dalign >= A || salign >= A) - abort(); - - for (int i = 0; i < len+A; i++) - src[i] = want[i] = '?'; - - for (int i = 0; i < len; i++) - s[i] = w[i] = 'a' + i%23; - - /* Copy the potential overlap range. */ - if (s < d) { - for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++) - want[salign+i] = src[salign+i]; - } else { - for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++) - want[len + dalign + i] = src[len + dalign + i]; - } - - p = fun->fun(d, s, len); - if (p != d) - ERR("%s(%p,..) returned %p\n", fun->name, d, p); - for (int i = 0; i < len+A; i++) { - if (dst[i] != want[i]) { - ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); - ERR("got : %.*s\n", dalign+len+1, dst); - ERR("want: %.*s\n", dalign+len+1, want); - abort(); - break; - } + unsigned char *src = alignup (sbuf); + unsigned char *dst = src; + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = wbuf + dalign; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + + for (int i = 0; i < len + A; i++) + src[i] = want[i] = '?'; + + for (int i = 0; i < len; i++) + s[i] = want[salign + i] = 'a' + i % 23; + for (int i = 0; i < len; i++) + w[i] = s[i]; + + s = tag_buffer (s, len, fun->test_mte); + d = tag_buffer (d, len, fun->test_mte); + p = fun->fun (d, s, len); + untag_buffer (s, len, fun->test_mte); + untag_buffer (d, len, fun->test_mte); + + if (p != d) + ERR ("%s(%p,..) 
returned %p\n", fun->name, d, p); + for (int i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; } + } } -int main() +int +main () { - test_overlap(funtab+0, 2, 1, 1); - - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) { - test(funtab+i, d, s, n); - test_overlap(funtab+i, d, s, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, d, s, n); - test_overlap(funtab+i, d, s, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + dbuf = mte_mmap (LEN + 2 * A); + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + { + test (funtab + i, d, s, n); + test_overlap (funtab + i, d, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n); + test_overlap (funtab + i, d, s, n); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/memrchr.c b/string/test/memrchr.c new file mode 100644 index 0000000..adf96f0 --- /dev/null +++ b/string/test/memrchr.c @@ -0,0 +1,106 @@ +/* + * memchr test. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (const void *s, int c, size_t n); + int test_mte; +} funtab[] = { + // clang-format off + F(memrchr, 0) +#if __aarch64__ + F(__memrchr_aarch64, 1) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, size_t seekpos, size_t len, + size_t maxlen) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos < maxlen ? s + seekpos : NULL; + int seekchar = 1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos > LEN || align > ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = seekchar; + for (int i = 0; i <= ALIGN; i++) + s[len + i] = seekchar; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[seekpos] = seekchar; + s[((len ^ align) & 1) && seekpos < maxlen ? 
seekpos - 1 : len] = seekchar; + + s = tag_buffer (s, maxlen, fun->test_mte); + p = fun->fun (s, seekchar, maxlen); + untag_buffer (s, maxlen, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, + seekchar, maxlen, p, f); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < LEN; sp++) + test (funtab + i, a, sp, n, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memset.c b/string/test/memset.c index 48c10fa..f172144 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -1,7 +1,7 @@ /* * memset test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,103 +9,121 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - void *(*fun)(void *s, int c, size_t n); + const char *name; + void *(*fun) (void *s, int c, size_t n); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(memset) + // clang-format off + F(memset, 0) #if __aarch64__ -F(__memset_aarch64) + F(__memset_aarch64, 1) #elif __arm__ -F(__memset_arm) + F(__memset_arm, 0) #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static unsigned char sbuf[LEN+2*A]; +static unsigned char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void err(const char *name, unsigned char *src, int salign, int c, int len) +static void +test (const struct fun *fun, int salign, int c, int len) { - ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len); - ERR("got : %.*s\n", salign+len+1, src); -} + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; -static void test(const struct fun *fun, int salign, int c, int len) -{ - unsigned char *src = alignup(sbuf); - unsigned char *s = src + salign; - void *p; - int i; + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i % 23; - if (len > LEN || salign >= A) - abort(); - for (i = 0; i < len+A; i++) - src[i] = '?'; - for (i = 0; i < len; i++) - s[i] = 'a' + i%23; - for (; i<len%A; i++) - s[i] = '*'; + s = tag_buffer (s, len, fun->test_mte); + p = fun->fun (s, c, len); + untag_buffer (s, len, fun->test_mte); - p = fun->fun(s, c, len); - if (p != s) - ERR("%s(%p,..) returned %p\n", fun->name, s, p); + if (p != s) + ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); - for (i = 0; i < salign; i++) { - if (src[i] != '?') { - err(fun->name, src, salign, c, len); - return; - } + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; } - for (i = salign; i < len; i++) { - if (src[i] != (unsigned char)c) { - err(fun->name, src, salign, c, len); - return; - } + } + for (; i < salign + len; i++) + { + if (src[i] != (unsigned char) c) + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; } - for (; i < len%A; i++) { - if (src[i] != '*') { - err(fun->name, src, salign, c, len); - return; - } + } + for (; i < len + A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; } + } } -int main() +int +main () { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) { - test(funtab+i, s, 0, n); - test(funtab+i, s, 0x25, n); - test(funtab+i, s, 0xaa25, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, s, 0, n); - test(funtab+i, s, 0x25, n); - test(funtab+i, s, 0xaa25, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + { + test (funtab + i, s, 0, n); + test (funtab + i, s, 0x25, n); + test (funtab + i, s, 0xaa25, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, 0, n); + test (funtab + i, s, 0x25, n); + test (funtab + i, s, 0xaa25, n); + } } - return r; + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/mte.h b/string/test/mte.h new file mode 100644 index 0000000..e67cbd9 --- /dev/null +++ b/string/test/mte.h @@ -0,0 +1,142 @@ +/* + * Memory tagging testing code. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef __TEST_MTE_H +#define __TEST_MTE_H + +#include <stdlib.h> + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <arm_acle.h> +#include <sys/mman.h> +#include <sys/prctl.h> + +// These depend on a not yet merged kernel ABI. 
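The prctl call in mte_enabled () below opts the task into tagged addressing with synchronous tag-check faults, and 0xfffe << PR_MTE_TAG_SHIFT is the tag inclusion mask: IRG may generate any tag except 0, so freshly tagged test buffers never collide with the default untagged view of memory. The enable sequence in isolation (a sketch assuming a kernel with the MTE ABI; the constants mirror the definitions below):

#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL 55
#define PR_TAGGED_ADDR_ENABLE (1UL << 0)
#define PR_MTE_TCF_SYNC (1UL << 1)
#define PR_MTE_TAG_SHIFT 3
#endif

static int
enable_mte_sync (void)
{
  /* 0xfffe: include tags 1..15 for random generation, exclude tag 0. */
  return prctl (PR_SET_TAGGED_ADDR_CTRL,
                PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC
                  | (0xfffeUL << PR_MTE_TAG_SHIFT),
                0, 0, 0) == 0;
}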
+#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_TAGGED_ADDR_ENABLE (1UL << 0) +#define PR_MTE_TCF_SHIFT 1 +#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) +#define PR_MTE_TAG_SHIFT 3 +#define PROT_MTE 0x20 + +#define MTE_GRANULE_SIZE 16 + +int +mte_enabled () +{ + static int enabled = -1; + if (enabled == -1) + { + int res = prctl (PR_SET_TAGGED_ADDR_CTRL, + PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC + | (0xfffe << PR_MTE_TAG_SHIFT), + 0, 0, 0); + enabled = (res == 0); + } + return enabled; +} + +static void * +mte_mmap (size_t size) +{ + if (mte_enabled ()) + { + return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + } + else + { + return malloc (size); + } +} + +void * +alignup_mte (void *p) +{ + return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1) + & ~(MTE_GRANULE_SIZE - 1)); +} + +void * +aligndown_mte (void *p) +{ + return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1)); +} + +void * +untag_pointer (void *p) +{ + return (void *) ((unsigned long long) p & (~0ULL >> 8)); +} + +void +tag_buffer_helper (void *p, int len) +{ + char *ptr = p; + char *end = alignup_mte (ptr + len); + ptr = aligndown_mte (p); + for (; ptr < end; ptr += MTE_GRANULE_SIZE) + { + __arm_mte_set_tag (ptr); + } +} + +void * +tag_buffer (void *p, int len, int test_mte) +{ + if (test_mte && mte_enabled ()) + { + p = __arm_mte_increment_tag (p, 1); + tag_buffer_helper (p, len); + } + return p; +} + +void * +untag_buffer (void *p, int len, int test_mte) +{ + p = untag_pointer (p); + if (test_mte && mte_enabled ()) + { + tag_buffer_helper (p, len); + } + return p; +} + +#else // __ARM_FEATURE_MEMORY_TAGGING +int +mte_enabled () +{ + return 0; +} +static void * +mte_mmap (size_t size) +{ + return malloc (size); +} +void * +tag_buffer (void *p, int len, int test_mte) +{ + (void) len; + (void) test_mte; + return p; +} +void * +untag_buffer (void *p, int len, int test_mte) +{ + (void) len; + (void) test_mte; + return p; +} +void * +untag_pointer (void *p) +{ + return p; +} +#endif // __ARM_FEATURE_MEMORY_TAGGING + +#endif diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c new file mode 100644 index 0000000..1827e68 --- /dev/null +++ b/string/test/stpcpy.c @@ -0,0 +1,125 @@ +/* + * stpcpy test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (char *dest, const char *src); + int test_mte; +} funtab[] = { + // clang-format off + F(stpcpy, 0) +#if __aarch64__ + F(__stpcpy_aarch64, 0) + F(__stpcpy_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__stpcpy_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *dbuf; +static char *sbuf; +static char wbuf[LEN + 3 * ALIGN]; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int dalign, int salign, int len) +{ + char *src = alignup (sbuf); + char *dst = alignup (dbuf); + char *want = wbuf; + char *s = src + salign; + char *d = dst + dalign; + char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= ALIGN || salign >= ALIGN) + abort (); + for (i = 0; i < len + ALIGN; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + salign) & 1 ? 1 : 0; + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + (i & 31); + s[len] = w[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + d = tag_buffer (d, len + 1, fun->test_mte); + p = fun->fun (d, s); + untag_buffer (s, len + 1, fun->test_mte); + untag_buffer (d, len + 1, fun->test_mte); + + if (p != d + len) + ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len); + + for (i = 0; i < len + ALIGN; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s (align %d, align %d, %d) failed\n", + fun->name, dalign, salign, len); + quoteat ("got", dst, len + ALIGN, i); + quoteat ("want", want, len + ALIGN, i); + break; + } + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + dbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < ALIGN; d++) + for (int s = 0; s < ALIGN; s++) + for (int n = 0; n < LEN; n++) + test (funtab + i, d, s, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strchr.c b/string/test/strchr.c index a625567..f3ae982 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -1,7 +1,7 @@ /* * strchr test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -10,88 +10,112 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - char *(*fun)(const char *s, int c); + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strchr) + // clang-format off + F(strchr, 0) #if __aarch64__ -F(__strchr_aarch64) + F(__strchr_aarch64, 0) + F(__strchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strchr_aarch64_sve) + F(__strchr_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int seekpos, int len) +static void +test (const struct fun *fun, int align, int seekpos, int len) { - char *src = alignup(sbuf); - char *s = src + align; - char *f = seekpos != -1 ? s + seekpos : 0; - int seekchar = 0x1; - void *p; + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : 0; + int seekchar = 0x1; + void *p; - if (len > LEN || seekpos >= len - 1 || align >= A) - abort(); - if (seekchar >= 'a' && seekchar <= 'a' + 23) - abort(); + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); - for (int i = 0; i < len + A; i++) - src[i] = '?'; - for (int i = 0; i < len - 2; i++) - s[i] = 'a' + i%23; - if (seekpos != -1) - s[seekpos] = seekchar; - s[len - 1] = '\0'; + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos] = seekchar; + if (seekpos != -1 && (len + align) & 1) + s[seekpos + 1] = seekchar; + s[len] = '\0'; - p = fun->fun(s, seekchar); + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, len + 1, fun->test_mte); + p = untag_pointer (p); - if (p != f) { - ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); - ERR("expected: %p\n", f); - abort(); - } + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, f, len); + quote ("input", s, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - int n; - for (n = 1; n < 100; n++) { - for (int sp = 0; sp < n - 1; sp++) - test(funtab+i, a, sp, n); - test(funtab+i, a, -1, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, a, -1, n); - test(funtab+i, a, n / 2, n); - } - } - printf("%s %s\n", test_status ? 
"FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index 814dd1e..6c30ab2 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -1,99 +1,126 @@ /* * strchrnul test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - char *(*fun)(const char *s, int c); + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strchrnul) + // clang-format off + F(strchrnul, 0) #if __aarch64__ -F(__strchrnul_aarch64) + F(__strchrnul_aarch64, 0) + F(__strchrnul_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strchrnul_aarch64_sve) + F(__strchrnul_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int seekpos, int len) +static void +test (const struct fun *fun, int align, int seekpos, int len) { - char *src = alignup(sbuf); - char *s = src + align; - char *f = seekpos != -1 ? s + seekpos : s + len - 1; - int seekchar = 0x1; - void *p; + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : s + len; + int seekchar = 0x1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); - if (len > LEN || seekpos >= len - 1 || align >= A) - abort(); - if (seekchar >= 'a' && seekchar <= 'a' + 23) - abort(); + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos] = seekchar; + if (seekpos != -1 && (len + align) & 1) + s[seekpos + 1] = seekchar; + s[len] = '\0'; - for (int i = 0; i < len + A; i++) - src[i] = '?'; - for (int i = 0; i < len - 2; i++) - s[i] = 'a' + i%23; - if (seekpos != -1) - s[seekpos] = seekchar; - s[len - 1] = '\0'; + int mte_len = seekpos != -1 ? 
seekpos + 1 : len + 1; + s = tag_buffer (s, mte_len, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, mte_len, fun->test_mte); + p = untag_pointer (p); - p = fun->fun(s, seekchar); + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } - if (p != f) { - ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); - ERR("expected: %p\n", f); - abort(); - } + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, f, len); + quote ("input", s, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - int n; - for (n = 1; n < 100; n++) { - for (int sp = 0; sp < n - 1; sp++) - test(funtab+i, a, sp, n); - test(funtab+i, a, -1, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, a, -1, n); - test(funtab+i, a, n / 2, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strcmp.c b/string/test/strcmp.c index 91fa9dd..d57b54e 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,7 +1,7 @@ /* * strcmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,95 +9,124 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - int (*fun)(const char *s1, const char *s2); + const char *name; + int (*fun) (const char *s1, const char *s2); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strcmp) + // clang-format off + F(strcmp, 0) #if __aarch64__ -F(__strcmp_aarch64) + F(__strcmp_aarch64, 0) + F(__strcmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strcmp_aarch64_sve) + F(__strcmp_aarch64_sve, 1) # endif #elif __arm__ # if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 -F(__strcmp_arm) + F(__strcmp_arm, 0) # elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 -F(__strcmp_armv6m) + F(__strcmp_armv6m, 0) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static char s1buf[LEN+2*A]; -static char s2buf[LEN+2*A]; +static char *s1buf; +static char *s2buf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos) +static void +test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, + int delta) { - char *src1 = alignup(s1buf); - char *src2 = alignup(s2buf); - char *s1 = src1 + s1align; - char *s2 = src2 + s2align; - int r; + char *src1 = alignup (s1buf); + char *src2 = alignup (s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; - if (len > LEN || s1align >= A || s2align >= A) - abort(); - if (diffpos > 1 && diffpos >= len-1) - abort(); + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); - for (int i = 0; i < len+A; i++) - src1[i] = src2[i] = '?'; - for (int i = 0; i < len-1; i++) - s1[i] = s2[i] = 'a' + i%23; - if (diffpos > 1) - s1[diffpos]++; - s1[len] = s2[len] = '\0'; + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; + s1[len] = s2[len] = '\0'; - r = fun->fun(s1, s2); + s1 = tag_buffer (s1, len + 1, fun->test_mte); + s2 = tag_buffer (s2, len + 1, fun->test_mte); + r = fun->fun (s1, s2); + untag_buffer (s1, len + 1, fun->test_mte); + untag_buffer (s2, len + 1, fun->test_mte); - if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) { - ERR("%s(align %d, align %d, %d) failed, returned %d\n", - fun->name, s1align, s2align, len, r); - ERR("src1: %.*s\n", s1align+len+1, src1); - ERR("src2: %.*s\n", s2align+len+1, src2); - } + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, + s1align, s2align, len, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } } -int main() +int +main () { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) { - test(funtab+i, d, s, n, 0); - test(funtab+i, d, s, n, n / 2); - } - for (; n < LEN; n *= 2) { - test(funtab+i, d, s, n, 0); - test(funtab+i, d, s, n, n / 2); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + s1buf = mte_mmap (LEN + 2 * A + 1); + s2buf = mte_mmap (LEN + 2 * A + 1); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0); + test (funtab + i, d, s, 1, -1, 0); + test (funtab + i, d, s, 1, 0, 1); + test (funtab + i, d, s, 1, 0, -1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n - 1, -1); + test (funtab + i, d, s, n, n / 2, 1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n / 2, -1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strcpy.c b/string/test/strcpy.c index 4882c9f..e84cace 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,7 +1,7 @@ /* * strcpy test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,91 +9,115 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - char *(*fun)(char *dest, const char *src); + const char *name; + char *(*fun) (char *dest, const char *src); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strcpy) + // clang-format off + F(strcpy, 0) #if __aarch64__ -F(__strcpy_aarch64) + F(__strcpy_aarch64, 0) + F(__strcpy_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strcpy_aarch64_sve) + F(__strcpy_aarch64_sve, 1) # endif #elif __arm__ && defined (__thumb2__) && !defined (__thumb__) -F(__strcpy_arm) + F(__strcpy_arm, 0) #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define LEN 250000 -static char dbuf[LEN+2*A]; -static char sbuf[LEN+2*A]; -static char wbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *dbuf; +static char *sbuf; +static char wbuf[LEN + 3 * ALIGN]; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int dalign, int salign, int len) +static void +test (const struct fun *fun, int dalign, int salign, int len) { - char *src = alignup(sbuf); - char *dst = alignup(dbuf); - char *want = wbuf; - char *s = src + salign; - char *d = dst + dalign; - char *w = want + dalign; - void *p; - int i; + char *src = alignup (sbuf); + char *dst = alignup (dbuf); + char *want = wbuf; + char *s = src + salign; + char *d = dst + dalign; + char *w = want + dalign; + void *p; + int i; - if (len > LEN || dalign >= A || salign >= A) - abort(); - for (i = 0; i < len+A; i++) { - src[i] = '?'; - want[i] = dst[i] = '*'; - } - for (i = 0; i < len-1; i++) - s[i] = w[i] = 'a' + i%23; - s[i] = w[i] = '\0'; + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= ALIGN || salign >= ALIGN) + abort (); + for (i = 0; i < len + ALIGN; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + salign) & 1 ? 1 : 0; + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + (i & 31); + s[len] = w[len] = '\0'; - p = fun->fun(d, s); - if (p != d) - ERR("%s(%p,..) returned %p\n", fun->name, d, p); - for (i = 0; i < len+A; i++) { - if (dst[i] != want[i]) { - ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); - ERR("got : %.*s\n", dalign+len+1, dst); - ERR("want: %.*s\n", dalign+len+1, want); - break; - } + s = tag_buffer (s, len + 1, fun->test_mte); + d = tag_buffer (d, len + 1, fun->test_mte); + p = fun->fun (d, s); + untag_buffer (s, len + 1, fun->test_mte); + untag_buffer (d, len + 1, fun->test_mte); + + if (p != d) + ERR ("%s (%p,..) 
returned %p\n", fun->name, d, p); + + for (i = 0; i < len + ALIGN; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s (align %d, align %d, %d) failed\n", + fun->name, dalign, salign, len); + quoteat ("got", dst, len + ALIGN, i); + quoteat ("want", want, len + ALIGN, i); + break; } + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) - test(funtab+i, d, s, n); - for (; n < LEN; n *= 2) - test(funtab+i, d, s, n); - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + dbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < ALIGN; d++) + for (int s = 0; s < ALIGN; s++) + for (int n = 0; n < LEN; n++) + test (funtab + i, d, s, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/stringtest.h b/string/test/stringtest.h new file mode 100644 index 0000000..fe855fc --- /dev/null +++ b/string/test/stringtest.h @@ -0,0 +1,55 @@ +/* + * Common string test code. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <ctype.h> +#include <stdio.h> + +/* Accounting errors for a test case. */ +static int err_count; +#define ERR_LIMIT 10 +#define ERR(...) (err_count++, printf (__VA_ARGS__)) + +static inline void +quotechar (unsigned char c) +{ + if (isprint (c)) + putchar (c); + else + printf ("\\x%02x", c); +} + +/* quoted print around at or the entire string if at < 0. */ +static void +quoteat (const char *prefix, const void *p, int len, int at) +{ + static const int CTXLEN = 15; + int i; + const char *pre = "\""; + const char *post = "\""; + const char *s = p; + if (at > CTXLEN) + { + s += at - CTXLEN; + len -= at - CTXLEN; + pre = "...\""; + } + if (at >= 0 && len > 2 * CTXLEN + 1) + { + len = 2 * CTXLEN + 1; + post = "\"..."; + } + printf ("%4s: %s", prefix, pre); + for (i = 0; i < len; i++) + quotechar (s[i]); + printf ("%s\n", post); +} + +static inline void +quote (const char *prefix, const void *p, int len) +{ + quoteat (prefix, p, len, -1); +} diff --git a/string/test/strlen.c b/string/test/strlen.c index ff8e328..6278380 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -1,7 +1,7 @@ /* * strlen test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,82 +9,95 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/mman.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - size_t (*fun)(const char *s); + const char *name; + size_t (*fun) (const char *s); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strlen) + // clang-format off + F(strlen, 0) #if __aarch64__ -F(__strlen_aarch64) + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strlen_aarch64_sve) + F(__strlen_aarch64_sve, 1) # endif #elif __arm__ # if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 -F(__strlen_armv6t2) + F(__strlen_armv6t2, 0) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int len) +static void +test (const struct fun *fun, int align, int len) { - char *src = alignup(sbuf); - char *s = src + align; - size_t r; + char *src = alignup (sbuf); + char *s = src + align; + size_t r; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || align >= ALIGN) + abort (); - if (len > LEN || align >= A) - abort(); + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + align) & 1 ? 1 : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[len] = '\0'; - for (int i = 0; i < len + A; i++) - src[i] = '?'; - for (int i = 0; i < len - 2; i++) - s[i] = 'a' + i%23; - s[len - 1] = '\0'; + s = tag_buffer (s, len + 1, fun->test_mte); + r = fun->fun (s); + untag_buffer (s, len + 1, fun->test_mte); - r = fun->fun(s); - if (r != len-1) { - ERR("%s(%p) returned %zu\n", fun->name, s, r); - ERR("input: %.*s\n", align+len+1, src); - ERR("expected: %d\n", len); - abort(); - } + if (r != len) + { + ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len); + quote ("input", src, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - int n; - for (n = 1; n < 100; n++) - test(funtab+i, a, n); - for (; n < LEN; n *= 2) - test(funtab+i, a, n); - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + test (funtab + i, a, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 43f941d..018a8a4 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,7 +1,7 @@ /* * strncmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -9,95 +9,131 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - int (*fun)(const char *, const char *, size_t); + const char *name; + int (*fun) (const char *, const char *, size_t); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strncmp) + // clang-format off + F(strncmp, 0) #if __aarch64__ -F(__strncmp_aarch64) + F(__strncmp_aarch64, 0) + F(__strncmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strncmp_aarch64_sve) + F(__strncmp_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; - -static int test_status; -#define ERR(...) 
(test_status=1, printf(__VA_ARGS__)) +#undef F #define A 32 #define LEN 250000 -static char s1buf[LEN+2*A]; -static char s2buf[LEN+2*A]; +static char *s1buf; +static char *s2buf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + A - 1) & -A); } -static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len) +static void +test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, + int len, int delta) { - char *src1 = alignup(s1buf); - char *src2 = alignup(s2buf); - char *s1 = src1 + s1align; - char *s2 = src2 + s2align; - int r; - - if (len > LEN || s1align >= A || s2align >= A) - abort(); - if (diffpos > 1 && diffpos >= len-1) - abort(); + char *src1 = alignup (s1buf); + char *src2 = alignup (s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; - for (int i = 0; i < len+A; i++) - src1[i] = src2[i] = '?'; - for (int i = 0; i < len-1; i++) - s1[i] = s2[i] = 'a' + i%23; - if (diffpos > 1) - s1[diffpos]++; - s1[len] = s2[len] = '\0'; + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); - r = fun->fun(s1, s2, maxlen); + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; + s1[len] = s2[len] = '\0'; - diffpos = maxlen <= diffpos ? 0 : diffpos; + size_t mte_len = maxlen < len + 1 ? maxlen : len + 1; + s1 = tag_buffer (s1, mte_len, fun->test_mte); + s2 = tag_buffer (s2, mte_len, fun->test_mte); + r = fun->fun (s1, s2, maxlen); + untag_buffer (s1, mte_len, fun->test_mte); + untag_buffer (s2, mte_len, fun->test_mte); - if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) { - ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n", - fun->name, s1align, s2align, maxlen, len, r, diffpos); - ERR("src1: %.*s\n", s1align+len+1, src1); - ERR("src2: %.*s\n", s2align+len+1, src2); - } + if (diffpos >= maxlen) + { + diffpos = -1; + delta = 0; + } + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ( + "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n", + fun->name, s1align, s2align, maxlen, len, diffpos, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } } -int main() +int +main () { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int d = 0; d < A; d++) - for (int s = 0; s < A; s++) { - int n; - for (n = 0; n < 100; n++) { - test(funtab+i, d, s, n, 0, n); - test(funtab+i, d, s, n, n/2, n); - test(funtab+i, d, s, n/2, 0, n); - test(funtab+i, d, s, n/2, n/2, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, d, s, n, 0, n); - test(funtab+i, d, s, n, n/2, n); - test(funtab+i, d, s, n/2, 0, n); - test(funtab+i, d, s, n/2, n/2, n); - } - } - printf("%s %s\n", test_status ? 
"FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + s1buf = mte_mmap (LEN + 2 * A + 1); + s2buf = mte_mmap (LEN + 2 * A + 1); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0, 0); + test (funtab + i, d, s, 1, -1, 0, 0); + test (funtab + i, d, s, 0, -1, 1, 0); + test (funtab + i, d, s, 1, -1, 1, 0); + test (funtab + i, d, s, 2, -1, 1, 0); + test (funtab + i, d, s, 1, 0, 1, 1); + test (funtab + i, d, s, 1, 0, 1, -1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, n, 0); + test (funtab + i, d, s, n, n / 2, n, 1); + test (funtab + i, d, s, n / 2, -1, n, 0); + test (funtab + i, d, s, n / 2, n / 2, n, -1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, n, 0); + test (funtab + i, d, s, n, n / 2, n, -1); + test (funtab + i, d, s, n / 2, -1, n, 0); + test (funtab + i, d, s, n / 2, n / 2, n, 1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strnlen.c b/string/test/strnlen.c index db41f2a..0dea00e 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -1,93 +1,109 @@ /* * strnlen test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ -#define _POSIX_C_SOURCE 200809L +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - size_t (*fun)(const char *s, size_t m); + const char *name; + size_t (*fun) (const char *s, size_t m); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strnlen) + // clang-format off + F(strnlen, 0) #if __aarch64__ -F(__strnlen_aarch64) + F(__strnlen_aarch64, 1) # if __ARM_FEATURE_SVE -F(__strnlen_aarch64_sve) + F(__strnlen_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int maxlen, int len) +static void +test (const struct fun *fun, int align, size_t maxlen, size_t len) { - char *src = alignup(sbuf); - char *s = src + align; - size_t r; - size_t e = maxlen < len ? maxlen : len - 1; + char *src = alignup (sbuf); + char *s = src + align; + size_t r; + size_t e = maxlen < len ? maxlen : len; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || align >= ALIGN) + abort (); - if (len > LEN || align >= A) - abort(); + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + align) & 1 ? 1 : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[len] = 0; + if ((len + align) & 1) + s[e + 1] = 0; - for (int i = 0; i < len + A; i++) - src[i] = '?'; - for (int i = 0; i < len - 2; i++) - s[i] = 'a' + i%23; - s[len - 1] = '\0'; + size_t mte_len = maxlen < len + 1 ? 
maxlen : len + 1; + s = tag_buffer (s, mte_len, fun->test_mte); + r = fun->fun (s, maxlen); + untag_buffer (s, mte_len, fun->test_mte); - r = fun->fun(s, maxlen); - if (r != e) { - ERR("%s(%p) returned %zu\n", fun->name, s, r); - ERR("input: %.*s\n", align+len+1, src); - ERR("expected: %d\n", len); - abort(); - } + if (r != e) + { + ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n", + fun->name, s, maxlen, len, r, e); + quote ("input", s, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - int n; - for (n = 1; n < 100; n++) - for (int maxlen = 0; maxlen < 100; maxlen++) - test(funtab+i, a, maxlen, n); - for (; n < LEN; n *= 2) { - test(funtab+i, a, n*2, n); - test(funtab+i, a, n, n); - test(funtab+i, a, n/2, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int maxlen = 0; maxlen < LEN; maxlen++) + test (funtab + i, a, maxlen, n); + test (funtab + i, a, SIZE_MAX - a, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/test/strrchr.c b/string/test/strrchr.c index 532fa51..fedbdc5 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -1,7 +1,7 @@ /* * strrchr test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -10,88 +10,112 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include "mte.h" #include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, static const struct fun { - const char *name; - char *(*fun)(const char *s, int c); + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; } funtab[] = { -#define F(x) {#x, x}, -F(strrchr) + // clang-format off + F(strrchr, 0) #if __aarch64__ -F(__strrchr_aarch64) + F(__strrchr_aarch64, 0) + F(__strrchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE -F(__strrchr_aarch64_sve) + F(__strrchr_aarch64_sve, 1) # endif #endif -#undef F - {0, 0} + {0, 0, 0} + // clang-format on }; +#undef F -static int test_status; -#define ERR(...) (test_status=1, printf(__VA_ARGS__)) - -#define A 32 -#define SP 512 -#define LEN 250000 -static char sbuf[LEN+2*A]; +#define ALIGN 32 +#define LEN 512 +static char *sbuf; -static void *alignup(void *p) +static void * +alignup (void *p) { - return (void*)(((uintptr_t)p + A-1) & -A); + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } -static void test(const struct fun *fun, int align, int seekpos, int len) +static void +test (const struct fun *fun, int align, int seekpos, int len) { - char *src = alignup(sbuf); - char *s = src + align; - char *f = seekpos != -1 ? s + seekpos : 0; - int seekchar = 0x1; - void *p; + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? 
s + seekpos : 0; + int seekchar = 0x1; + void *p; - if (len > LEN || seekpos >= len - 1 || align >= A) - abort(); - if (seekchar >= 'a' && seekchar <= 'a' + 23) - abort(); + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); - for (int i = 0; i < len + A; i++) - src[i] = '?'; - for (int i = 0; i < len - 2; i++) - s[i] = 'a' + i%23; - if (seekpos != -1) - s[seekpos/2] = s[seekpos] = seekchar; - s[len - 1] = '\0'; + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos / 2] = s[seekpos] = seekchar; + if (seekpos > 0 && (len + align) & 1) + s[seekpos - 1] = seekchar; + s[len] = '\0'; - p = fun->fun(s, seekchar); + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, len + 1, fun->test_mte); + p = untag_pointer (p); - if (p != f) { - ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p); - ERR("expected: %p\n", f); - abort(); - } + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, s + len, len); + quote ("input", s, len); + } } -int main() +int +main (void) { - int r = 0; - for (int i=0; funtab[i].name; i++) { - test_status = 0; - for (int a = 0; a < A; a++) { - int n; - for (n = 1; n < 100; n++) { - for (int sp = 0; sp < n - 1; sp++) - test(funtab+i, a, sp, n); - test(funtab+i, a, -1, n); - } - for (; n < LEN; n *= 2) { - test(funtab+i, a, -1, n); - test(funtab+i, a, n / 2, n); - } - } - printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name); - if (test_status) - r = -1; - } - return r; + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; } diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S new file mode 100644 index 0000000..26ade0a --- /dev/null +++ b/string/x86_64/check-arch.S @@ -0,0 +1,10 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if !__x86_64__ +# error ARCH setting does not match the compiler. +#endif
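All of the rewritten tests above share one calling convention around the new
mte.h helpers: allocate the buffer with mte_mmap, retag exactly the byte range
the routine may touch with tag_buffer just before the call, restore the buffer
with untag_buffer afterwards, and strip the tag from any returned pointer with
untag_pointer before comparing addresses. A minimal sketch of that pattern,
with plain strlen standing in for a routine under test (the check function
and its loop bound are illustrative, not part of this commit):

#include <stdio.h>
#include <string.h>
#include "mte.h"
#include "stringtest.h"

#define LEN 512
static char *sbuf;

/* Fill sbuf with a len-byte string, then call strlen through the
   tag/untag convention used by the tests in this change.  */
static void
check (int len)
{
  if (err_count >= ERR_LIMIT)
    return;
  char *s = sbuf;
  for (int i = 0; i < len; i++)
    s[i] = 'a' + (i & 31);
  s[len] = '\0';

  /* Both calls are no-ops unless MTE is compiled in and enabled.  */
  s = tag_buffer (s, len + 1, 1);
  size_t r = strlen (s);
  s = untag_buffer (s, len + 1, 1);

  if (r != (size_t) len)
    {
      ERR ("strlen (%p) returned %zu, expected %d\n", s, r, len);
      quote ("input", s, len);
    }
}

int
main (void)
{
  sbuf = mte_mmap (LEN + 1);
  err_count = 0;
  for (int n = 0; n < LEN; n++)
    check (n);
  printf ("%s strlen\n", err_count ? "FAIL" : mte_enabled () ? "MTE PASS" : "PASS");
  return err_count ? -1 : 0;
}

Note that untag_pointer is only needed for routines that return a pointer into
the tagged buffer (memchr, strchr, stpcpy and friends); strlen returns a
length, so the sketch omits it.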