diff options
author | Pierre Blanchard <pierre.blanchard@arm.com> | 2022-12-19 12:06:01 +0000 |
---|---|---|
committer | Pierre Blanchard <pierre.blanchard@arm.com> | 2022-12-19 12:06:09 +0000 |
commit | 0976cbd22158e1152070079642c21a6e02c21141 (patch) | |
tree | 255f11ec0de7218a7311401e6f9b2a8b3c92ae21 | |
parent | 202e46317ee8983516b6413066a57bd624ffa044 (diff) | |
download | arm-optimized-routines-0976cbd22158e1152070079642c21a6e02c21141.tar.gz |
pl/math: Improve vector/Neon log2f
A new implementation based on the same approach as
Neon logf, that is accurate to 2.48 ULPs.
Flags set correctly regardless of WANT_ERRNO.
-rw-r--r-- | pl/math/math_config.h | 10 | ||||
-rw-r--r-- | pl/math/s_log2f_2u5.c (renamed from pl/math/s_log2f_2u6.c) | 2 | ||||
-rw-r--r-- | pl/math/tools/v_log2f.sollya | 38 | ||||
-rw-r--r-- | pl/math/v_log2f_2u5.c | 68 | ||||
-rw-r--r-- | pl/math/v_log2f_2u6.c | 132 | ||||
-rw-r--r-- | pl/math/v_log2f_data.c | 36 | ||||
-rw-r--r-- | pl/math/vn_log2f_2u5.c (renamed from pl/math/vn_log2f_2u6.c) | 4 |
7 files changed, 118 insertions, 172 deletions
diff --git a/pl/math/math_config.h b/pl/math/math_config.h index 99132a0..81da863 100644 --- a/pl/math/math_config.h +++ b/pl/math/math_config.h @@ -476,16 +476,10 @@ extern const struct tanf_poly_data float poly_cotan[TANF_Q_POLY_NCOEFFS]; } __tanf_poly_data HIDDEN; -#define V_LOG2F_TABLE_BITS 4 -#define V_LOG2F_POLY_ORDER 4 +#define V_LOG2F_POLY_NCOEFFS 9 extern const struct v_log2f_data { - struct - { - /* Pad with dummy for quad-aligned memory access. */ - float invc_hi, invc_lo, logc, dummy; - } tab[1 << V_LOG2F_TABLE_BITS]; - float poly[V_LOG2F_POLY_ORDER]; + float poly[V_LOG2F_POLY_NCOEFFS]; } __v_log2f_data HIDDEN; #define V_LOG2_TABLE_BITS 7 diff --git a/pl/math/s_log2f_2u6.c b/pl/math/s_log2f_2u5.c index 8e5569d..7077814 100644 --- a/pl/math/s_log2f_2u6.c +++ b/pl/math/s_log2f_2u5.c @@ -3,4 +3,4 @@ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 -#include "v_log2f_2u6.c" +#include "v_log2f_2u5.c" diff --git a/pl/math/tools/v_log2f.sollya b/pl/math/tools/v_log2f.sollya new file mode 100644 index 0000000..18869a5 --- /dev/null +++ b/pl/math/tools/v_log2f.sollya @@ -0,0 +1,38 @@ +// polynomial used for __v_log2f(x) +// +// Copyright (c) 2022, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +a = -1/3; +b = 1/3; + +ln2 = evaluate(log(2),0); +invln2 = single(1/ln2); + +// find log2(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log2(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log2(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f * invln2; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln2; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c new file mode 100644 index 0000000..343185c --- /dev/null +++ b/pl/math/v_log2f_2u5.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_hornerf.h" +#include "pl_sig.h" +#include "pl_test.h" +#if V_SUPPORTED + +#define C(i) v_f32 (__v_log2f_data.poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, x, y, cmp); +} + +/* Fast implementation for single precision log2, + relies on same argument reduction as Neon logf. + Maximum error: 2.48 ULPs + __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +VPCS_ATTR +v_f32_t V_NAME (log2f) (v_f32_t x) +{ + v_u32_t u = v_as_u32_f32 (x); + v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log2(1+r) + n. */ + v_f32_t r2 = r * r; + v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); + v_f32_t y = v_fma_f32 (p, r, n); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2f), 1.99) +PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) +PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) +#endif diff --git a/pl/math/v_log2f_2u6.c b/pl/math/v_log2f_2u6.c deleted file mode 100644 index aa011cd..0000000 --- a/pl/math/v_log2f_2u6.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Single-precision vector log2 function. - * - * Copyright (c) 2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define N (1 << V_LOG2F_TABLE_BITS) -#define T __v_log2f_data.tab -#define A __v_log2f_data.poly -#define OFF 0x3f330000 -#define SubnormLim 0x800000 -#define One v_u32 (0x3f800000) - -static float -handle_special (float x) -{ - if (x != x) - /* NaN - return NaN but do not trigger invalid. */ - return x; - if (x < 0) - /* log2f(-anything) = NaN. */ - return __math_invalidf (x); - if (x == 0) - /* log2f(0) = Inf. */ - return __math_divzerof (1); - /* log2f(Inf) = Inf. */ - return x; -} - -static float -normalise (float x) -{ - return asfloat (asuint (x * 0x1p23f) - (23 << 23)); -} - -#ifdef SCALAR - -#define DEFINE_LOOKUP_FUNC(p) \ - static inline float lookup_##p (uint32_t i) { return T[i].p; } - -#else - -#define DEFINE_LOOKUP_FUNC(p) \ - static inline v_f32_t lookup_##p (v_u32_t i) \ - { \ - return (v_f32_t){T[i[0]].p, T[i[1]].p, T[i[2]].p, T[i[3]].p}; \ - } - -#endif - -DEFINE_LOOKUP_FUNC (invc_lo) -DEFINE_LOOKUP_FUNC (invc_hi) -DEFINE_LOOKUP_FUNC (logc) - -/* Single-precision vector log2 routine. Implements the same algorithms as - scalar log2f, but using only single-precision arithmetic, with invc - represented as a two-limb float. Accurate to 2.6 ulp. The maximum error is - near sqrt(2): - __v_log2f(0x1.6a0484p+0) got 0x1.ffea02p-2 - want 0x1.ffea08p-2. */ -VPCS_ATTR v_f32_t V_NAME (log2f) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - - /* x is +-Inf, +-NaN, 0 or -ve. */ - v_u32_t special = v_cond_u32 (ix >= 0x7f800000) | v_cond_u32 (ix == 0); - /* |x| < 2^126 (i.e. x is subnormal). */ - v_u32_t subnorm = v_cond_u32 (ix < SubnormLim); - - /* Sidestep special lanes so they do not inadvertently trigger fenv - exceptions. They will be fixed up later. */ - if (unlikely (v_any_u32 (special))) - ix = v_sel_u32 (special, One, ix); - - if (unlikely (v_any_u32 (subnorm))) - { - /* Normalize any subnormals. */ - v_f32_t tmp_x = v_as_f32_u32 (ix); - ix = v_as_u32_f32 (v_call_f32 (normalise, tmp_x, tmp_x, subnorm)); - } - - /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - v_u32_t tmp = ix - OFF; - v_u32_t i = (tmp >> (23 - V_LOG2F_TABLE_BITS)) % N; - v_u32_t top = tmp & 0xff800000; - v_u32_t iz = ix - top; - v_f32_t k = v_to_f32_s32 (v_as_s32_u32 (tmp) >> 23); /* Arithmetic shift. */ - v_f32_t z = v_as_f32_u32 (iz); - - v_f32_t invc_lo = lookup_invc_lo (i); - v_f32_t invc_hi = lookup_invc_hi (i); - v_f32_t logc = lookup_logc (i); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - v_f32_t r = v_fma_f32 (z, invc_hi, v_f32 (-1)); - r = v_fma_f32 (z, invc_lo, r); - v_f32_t y0 = logc + k; - - /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ - v_f32_t r2 = r * r; - v_f32_t y = v_fma_f32 (v_f32 (A[1]), r, v_f32 (A[2])); - y = v_fma_f32 (v_f32 (A[0]), r2, y); - v_f32_t p = v_fma_f32 (v_f32 (A[3]), r, y0); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (special))) - return v_call_f32 (handle_special, x, y, special); - - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2f), 2.10) -PL_TEST_EXPECT_FENV (V_NAME (log2f), WANT_ERRNO) -PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) -#endif diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c index e6c1f71..7e5cb1e 100644 --- a/pl/math/v_log2f_data.c +++ b/pl/math/v_log2f_data.c @@ -1,5 +1,5 @@ /* - * Coefficients and table entries for vector log2f + * Coefficients for vector log2f * * Copyright (c) 2022, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception @@ -7,31 +7,9 @@ #include "math_config.h" -const struct v_log2f_data __v_log2f_data = { -/* All values here are derived from the values in math/log2f_data.c. - For all i: - tab[i].invc_hi = (float) log2f_data.invc - tab[i].invc_lo = log2f_data.invc - (double) tab[i].invc_hi - tab[i].logc = (float) log2f_data.logc - poly[i] = (float) log2f_data.poly[i]. */ - .tab = { - { 0x1.661ec8p+0, -0x1.81c31p-26, -0x1.efec66p-2, 0}, - { 0x1.571ed4p+0, 0x1.55f108p-25, -0x1.b0b684p-2, 0}, - { 0x1.4953ap+0, -0x1.e1fdeap-25, -0x1.7418bp-2, 0}, - { 0x1.3c995cp+0, -0x1.e8ff9p-25, -0x1.39de92p-2, 0}, - { 0x1.30d19p+0, 0x1.910c94p-25, -0x1.01d9cp-2, 0}, - { 0x1.25e228p+0, -0x1.3d1c58p-26, -0x1.97c1d2p-3, 0}, - { 0x1.1bb4a4p+0, 0x1.434688p-25, -0x1.2f9e3ap-3, 0}, - { 0x1.12359p+0, -0x1.eea348p-25, -0x1.960cbcp-4, 0}, - { 0x1.0953f4p+0, 0x1.9900a8p-28, -0x1.a6f9dcp-5, 0}, - { 0x1p+0, 0x0p+0, 0x0p+0, 0}, - { 0x1.e608dp-1, -0x1.32dc2ap-28, 0x1.338caap-4, 0}, - { 0x1.ca4b32p-1, -0x1.fb2acp-30, 0x1.476a96p-3, 0}, - { 0x1.b20366p-1, -0x1.12a064p-26, 0x1.e840b4p-3, 0}, - { 0x1.9c2d16p-1, 0x1.d0d516p-28, 0x1.40646p-2, 0}, - { 0x1.886e6p-1, 0x1.bc20f6p-28, 0x1.88e9c2p-2, 0}, - { 0x1.767ddp-1, -0x1.5596f4p-26, 0x1.ce0a44p-2, 0}, - }, - .poly = { -0x1.712b70p-2, 0x1.ecabf4p-2, - -0x1.71547ap-1, 0x1.715476p+0 } -}; +/* See tools/v_log2f.sollya for the algorithm used to generate these + coefficients. */ +const struct v_log2f_data __v_log2f_data + = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ + -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, + -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/pl/math/vn_log2f_2u6.c b/pl/math/vn_log2f_2u5.c index 18effaf..b1e491a 100644 --- a/pl/math/vn_log2f_2u6.c +++ b/pl/math/vn_log2f_2u5.c @@ -7,6 +7,6 @@ #include "include/mathlib.h" #ifdef __vpcs #define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log2f, _ZGVnN4v_log2f) -#include "v_log2f_2u6.c" +#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#include "v_log2f_2u5.c" #endif |