aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2022-12-09 12:19:38 +0000
committerJoe Ramsay <joe.ramsay@arm.com>2022-12-09 12:19:38 +0000
commitbc7cc9d2a762a26b2fcbf150b3fc9c6993ffa16c (patch)
tree0b06cfa5fab3f8f594ff16a1d093a92412e8fb86
parent132d2f5da6155e64ff39a54fdbb46145a3892d6a (diff)
downloadarm-optimized-routines-bc7cc9d2a762a26b2fcbf150b3fc9c6993ffa16c.tar.gz
pl/math: Add polynomial helpers
Add macros for simplifying polynomial evaluation using either Horner, pairwise Horner or Estrin. Several routines have been modified to use the new helpers. Readability is improved slightly, and we expect that this will make prototyping new routines simpler.
-rw-r--r--pl/math/asinh_2u5.c39
-rw-r--r--pl/math/asinhf_3u5.c13
-rw-r--r--pl/math/atan_common.h34
-rw-r--r--pl/math/atanf_common.h8
-rw-r--r--pl/math/cbrtf_1u5.c6
-rw-r--r--pl/math/erfc_4u5.c23
-rw-r--r--pl/math/erfcf.h38
-rw-r--r--pl/math/erfcf_2u.c4
-rw-r--r--pl/math/erff_1u5.c22
-rw-r--r--pl/math/estrin.h16
-rw-r--r--pl/math/estrin_wrap.h48
-rw-r--r--pl/math/estrinf.h14
-rw-r--r--pl/math/expm1_2u5.c23
-rw-r--r--pl/math/expm1f_1u6.c8
-rw-r--r--pl/math/horner.h14
-rw-r--r--pl/math/horner_wrap.h34
-rw-r--r--pl/math/hornerf.h14
-rw-r--r--pl/math/log1p_2u.c13
-rw-r--r--pl/math/log1p_common.h61
-rw-r--r--pl/math/log1pf_2u1.c15
-rw-r--r--pl/math/pairwise_horner.h14
-rw-r--r--pl/math/pairwise_horner_wrap.h36
-rw-r--r--pl/math/pairwise_hornerf.h14
-rwxr-xr-xpl/math/test/runulp.sh2
-rw-r--r--pl/math/v_asinh_2u5.c38
-rw-r--r--pl/math/v_erfc_4u.c26
-rw-r--r--pl/math/v_erfcf_1u.c36
-rw-r--r--pl/math/v_log1p_2u5.c12
-rw-r--r--pl/math/v_tanf_3u2.c10
-rw-r--r--pl/math/v_tanhf_2u6.c6
30 files changed, 299 insertions, 342 deletions
diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c
index f22b342..9cbdd33 100644
--- a/pl/math/asinh_2u5.c
+++ b/pl/math/asinh_2u5.c
@@ -5,48 +5,17 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
+#include "estrin.h"
#define AbsMask 0x7fffffffffffffff
#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */
#define One 0x3ff0000000000000 /* asuint64(1.0). */
#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */
#define Ln2 0x1.62e42fefa39efp-1
-#define C(i) __asinh_data.poly[i]
double
optr_aor_log_f64 (double);
-static inline double
-eval_poly (double z)
-{
- /* Evaluate polynomial using Estrin scheme. */
- double p_01 = fma (z, C (1), C (0));
- double p_23 = fma (z, C (3), C (2));
- double p_45 = fma (z, C (5), C (4));
- double p_67 = fma (z, C (7), C (6));
- double p_89 = fma (z, C (9), C (8));
- double p_ab = fma (z, C (11), C (10));
- double p_cd = fma (z, C (13), C (12));
- double p_ef = fma (z, C (15), C (14));
- double p_gh = fma (z, C (17), C (16));
-
- double z2 = z * z;
- double p_03 = fma (z2, p_23, p_01);
- double p_47 = fma (z2, p_67, p_45);
- double p_8b = fma (z2, p_ab, p_89);
- double p_cf = fma (z2, p_ef, p_cd);
-
- double z4 = z2 * z2;
- double p_07 = fma (z4, p_47, p_03);
- double p_8f = fma (z4, p_cf, p_8b);
-
- double z8 = z4 * z4;
- double p_0f = fma (z8, p_8f, p_07);
-
- double z16 = z8 * z8;
- return fma (z16, p_gh, p_0f);
-}
-
/* Scalar double-precision asinh implementation. This routine uses different
approaches on different intervals:
@@ -86,7 +55,11 @@ asinh (double x)
if (ia < One)
{
double x2 = x * x;
- double p = eval_poly (x2);
+ double z2 = x2 * x2;
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+#define C(i) __asinh_data.poly[i]
+ double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
double y = fma (p, x2 * ax, ax);
return asdouble (asuint64 (y) | sign);
}
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
index 8aa62ad..48acdef 100644
--- a/pl/math/asinhf_3u5.c
+++ b/pl/math/asinhf_3u5.c
@@ -5,6 +5,7 @@
*/
#include "math_config.h"
+#include "estrinf.h"
#define AbsMask (0x7fffffff)
#define SqrtFltMax (0x1.749e96p+10f)
@@ -53,17 +54,7 @@ asinhf (float x)
if (ia12 < One)
{
float x2 = ax * ax;
- float x4 = x2 * x2;
-
- float p_01 = fmaf (ax, C (1), C (0));
- float p_23 = fmaf (ax, C (3), C (2));
- float p_45 = fmaf (ax, C (5), C (4));
- float p_67 = fmaf (ax, C (7), C (6));
-
- float p_03 = fmaf (x2, p_23, p_01);
- float p_47 = fmaf (x2, p_67, p_45);
-
- float p = fmaf (x4, p_47, p_03);
+ float p = ESTRIN_7 (ax, x2, x2 * x2, C);
float y = fmaf (x2, p, ax);
return asfloat (asuint (y) | sign);
}
diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h
index 1690e7e..331c1bb 100644
--- a/pl/math/atan_common.h
+++ b/pl/math/atan_common.h
@@ -7,19 +7,18 @@
*/
#include "math_config.h"
+#include "estrin.h"
#if V_SUPPORTED
#include "v_math.h"
#define DBL_T v_f64_t
-#define FMA v_fma_f64
#define P(i) v_f64 (__atan_poly_data.poly[i])
#else
#define DBL_T double
-#define FMA fma
#define P(i) __atan_poly_data.poly[i]
#endif
@@ -29,37 +28,14 @@
static inline DBL_T
eval_poly (DBL_T z, DBL_T az, DBL_T shift)
{
- /* Use full Estrin scheme for P(z^2) with deg(P)=19. */
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16. */
DBL_T z2 = z * z;
- /* Level 1. */
- DBL_T P_1_0 = FMA (P (1), z2, P (0));
- DBL_T P_3_2 = FMA (P (3), z2, P (2));
- DBL_T P_5_4 = FMA (P (5), z2, P (4));
- DBL_T P_7_6 = FMA (P (7), z2, P (6));
- DBL_T P_9_8 = FMA (P (9), z2, P (8));
- DBL_T P_11_10 = FMA (P (11), z2, P (10));
- DBL_T P_13_12 = FMA (P (13), z2, P (12));
- DBL_T P_15_14 = FMA (P (15), z2, P (14));
- DBL_T P_17_16 = FMA (P (17), z2, P (16));
- DBL_T P_19_18 = FMA (P (19), z2, P (18));
-
- /* Level 2. */
DBL_T x2 = z2 * z2;
- DBL_T P_3_0 = FMA (P_3_2, x2, P_1_0);
- DBL_T P_7_4 = FMA (P_7_6, x2, P_5_4);
- DBL_T P_11_8 = FMA (P_11_10, x2, P_9_8);
- DBL_T P_15_12 = FMA (P_15_14, x2, P_13_12);
- DBL_T P_19_16 = FMA (P_19_18, x2, P_17_16);
-
- /* Level 3. */
DBL_T x4 = x2 * x2;
- DBL_T P_7_0 = FMA (P_7_4, x4, P_3_0);
- DBL_T P_15_8 = FMA (P_15_12, x4, P_11_8);
-
- /* Level 4. */
DBL_T x8 = x4 * x4;
- DBL_T y = FMA (P_19_16, x8, P_15_8);
- y = FMA (y, x8, P_7_0);
+ DBL_T y
+ = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = FMA (y, z2 * az, az);
diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h
index 436b88b..3038e54 100644
--- a/pl/math/atanf_common.h
+++ b/pl/math/atanf_common.h
@@ -10,19 +10,18 @@
#define PL_MATH_ATANF_COMMON_H
#include "math_config.h"
+#include "estrinf.h"
#if V_SUPPORTED
#include "v_math.h"
#define FLT_T v_f32_t
-#define FMA v_fma_f32
#define P(i) v_f32 (__atanf_poly_data.poly[i])
#else
#define FLT_T float
-#define FMA fmaf
#define P(i) __atanf_poly_data.poly[i]
#endif
@@ -42,10 +41,7 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift)
FLT_T z4 = z2 * z2;
/* Then assemble polynomial. */
- FLT_T y
- = FMA (z4,
- z4 * FMA (z4, (FMA (z2, P (7), P (6))), (FMA (z2, P (5), P (4)))),
- FMA (z4, (FMA (z2, P (3), P (2))), (FMA (z2, P (1), P (0)))));
+ FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
/* Finalize:
y = shift + z * P(z^2). */
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
index 73b9049..d544a68 100644
--- a/pl/math/cbrtf_1u5.c
+++ b/pl/math/cbrtf_1u5.c
@@ -8,6 +8,7 @@
#include <math.h>
#include "math_config.h"
+#include "estrinf.h"
#define AbsMask 0x7fffffff
#define SignMask 0x80000000
@@ -40,10 +41,7 @@ cbrtf (float x)
/* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
the less accurate the next stage of the algorithm needs to be. An order-4
polynomial is enough for one Newton iteration. */
- float p_01 = fmaf (C (1), m, C (0));
- float p_23 = fmaf (C (3), m, C (2));
- float p = fmaf (m * m, p_23, p_01);
-
+ float p = ESTRIN_3 (m, m * m, C);
/* One iteration of Newton's method for iteratively approximating cbrt. */
float m_by_3 = m / 3;
float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c
index 003a0cc..8088562 100644
--- a/pl/math/erfc_4u5.c
+++ b/pl/math/erfc_4u5.c
@@ -9,6 +9,7 @@
#include <math.h>
#include <errno.h>
#include "math_config.h"
+#include "pairwise_horner.h"
#define AbsMask (0x7fffffffffffffff)
@@ -19,28 +20,12 @@
double
__exp_dd (double x, double xtail);
-/* Evaluate order-12 polynomials using
- pairwise summation and Horner scheme
- in double precision. */
static inline double
eval_poly_horner (double z, int i)
{
- double r1, r2, r3, r4, r5, r6, z2;
- r1 = fma (z, PX[i][1], PX[i][0]);
- r2 = fma (z, PX[i][3], PX[i][2]);
- r3 = fma (z, PX[i][5], PX[i][4]);
- r4 = fma (z, PX[i][7], PX[i][6]);
- r5 = fma (z, PX[i][9], PX[i][8]);
- r6 = fma (z, PX[i][11], PX[i][10]);
- z2 = z * z;
- double r = PX[i][12];
- r = fma (z2, r, r6);
- r = fma (z2, r, r5);
- r = fma (z2, r, r4);
- r = fma (z2, r, r3);
- r = fma (z2, r, r2);
- r = fma (z2, r, r1);
- return r;
+ double z2 = z * z;
+#define C(j) PX[i][j]
+ return PAIRWISE_HORNER_12 (z, z2, C);
}
/* Accurate evaluation of exp(x^2)
diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h
index 6adc6b4..98ead38 100644
--- a/pl/math/erfcf.h
+++ b/pl/math/erfcf.h
@@ -8,39 +8,24 @@
#ifndef PL_MATH_ERFCF_H
#define PL_MATH_ERFCF_H
-#include <math.h>
+#include "math_config.h"
+
+#define FMA fma
+#include "estrin_wrap.h"
/* Accurate exponential from optimized-routines. */
double
__exp_dd (double x, double xtail);
-/* Evaluate order-12 polynomials using pairwise summation and Horner scheme in
- double precision. */
static inline double
-eval_poly_horner_lvl2 (double z, const double *coeff)
+eval_poly (double z, const double *coeff)
{
- double r1, r2, r3, r4, r5, r6, r7, r8;
- double R1, R2, R3, R4;
- double Q1, Q2;
- double z2, z4, z8;
- z2 = z * z;
- r1 = fma (z, coeff[1], coeff[0]);
- r2 = fma (z, coeff[3], coeff[2]);
- z4 = z2 * z2;
- z8 = z4 * z4;
- R1 = fma (z2, r2, r1);
- r3 = fma (z, coeff[5], coeff[4]);
- r4 = fma (z, coeff[7], coeff[6]);
- R2 = fma (z2, r4, r3);
- Q1 = fma (z4, R2, R1);
- r5 = fma (z, coeff[9], coeff[8]);
- r6 = fma (z, coeff[11], coeff[10]);
- R3 = fma (z2, r6, r5);
- r7 = fma (z, coeff[13], coeff[12]);
- r8 = fma (z, coeff[15], coeff[14]);
- R4 = fma (z2, r8, r7);
- Q2 = fma (z4, R4, R3);
- return fma (z8, Q2, Q1);
+ double z2 = z * z;
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+#define C(i) coeff[i]
+ return ESTRIN_15 (z, z2, z4, z8, C);
+#undef C
}
static inline double
@@ -49,4 +34,5 @@ eval_exp_mx2 (double x)
return __exp_dd (-(x * x), 0.0);
}
+#undef FMA
#endif // PL_MATH_ERFCF_H
diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
index 80dba83..8d4bba1 100644
--- a/pl/math/erfcf_2u.c
+++ b/pl/math/erfcf_2u.c
@@ -21,7 +21,7 @@ approx_erfcf_hi (float x, uint32_t sign, const double *coeff)
/* Polynomial contribution. */
double z = (double) fabs (x);
- float p = (float) eval_poly_horner_lvl2 (z, coeff);
+ float p = (float) eval_poly (z, coeff);
/* Gaussian contribution. */
float e_mx2 = (float) eval_exp_mx2 (z);
@@ -34,7 +34,7 @@ approx_erfcf_lo (float x, uint32_t sign, const double *coeff)
{
/* Polynomial contribution. */
double z = (double) fabs (x);
- float p = (float) eval_poly_horner_lvl2 (z, coeff);
+ float p = (float) eval_poly (z, coeff);
/* Gaussian contribution. */
float e_mx2 = (float) eval_exp_mx2 (z);
diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c
index 1073603..bad68a6 100644
--- a/pl/math/erff_1u5.c
+++ b/pl/math/erff_1u5.c
@@ -7,7 +7,10 @@
#include <stdint.h>
#include <math.h>
+
#include "math_config.h"
+#include "hornerf.h"
+#include "estrinf.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
#define A __erff_data.erff_poly_A
@@ -26,7 +29,7 @@ top12 (float x)
float
erff (float x)
{
- float r, x2, u;
+ float r, x2;
/* Get top word. */
uint32_t ix = asuint (x);
@@ -54,23 +57,18 @@ erff (float x)
/* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2).
*/
- r = A[5];
- r = fmaf (r, x2, A[4]);
- r = fmaf (r, x2, A[3]);
- r = fmaf (r, x2, A[2]);
- r = fmaf (r, x2, A[1]);
- r = fmaf (r, x2, A[0]);
- r = fmaf (r, x, x);
+#define C(i) A[i]
+ r = fmaf (HORNER_5 (x2, C), x, x);
+#undef C
}
else if (ia12 < 0x408)
{ /* |x| < 4.0 - Use a custom Estrin scheme. */
float a = fabsf (x);
/* Use Estrin scheme on high order (small magnitude) coefficients. */
- r = fmaf (B[6], a, B[5]);
- u = fmaf (B[4], a, B[3]);
- x2 = x * x;
- r = fmaf (r, x2, u);
+#define C(i) B[i]
+ r = ESTRIN_3_ (a, x * x, C, 3);
+#undef C
/* Then switch to pure Horner scheme. */
r = fmaf (r, a, B[2]);
r = fmaf (r, a, B[1]);
diff --git a/pl/math/estrin.h b/pl/math/estrin.h
new file mode 100644
index 0000000..89df329
--- /dev/null
+++ b/pl/math/estrin.h
@@ -0,0 +1,16 @@
+/*
+ * Helper macros for double-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h
new file mode 100644
index 0000000..93af2ab
--- /dev/null
+++ b/pl/math/estrin_wrap.h
@@ -0,0 +1,48 @@
+/*
+ * Helper macros for double-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i))
+#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i))
+#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i))
+#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
+#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
+#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+
+#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0)
+#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0)
+#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0)
+#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0)
+#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0)
+#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0)
+#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0)
+#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0)
+#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0)
+#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0)
+#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0)
+#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0)
+#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0)
+#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0)
+#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0)
+#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0)
+// clang-format on
diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h
new file mode 100644
index 0000000..be52ab5
--- /dev/null
+++ b/pl/math/estrinf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
index c701d7e..98ef078 100644
--- a/pl/math/expm1_2u5.c
+++ b/pl/math/expm1_2u5.c
@@ -6,6 +6,7 @@
*/
#include "math_config.h"
+#include "estrin.h"
#define InvLn2 0x1.71547652b82fep0
#define Ln2hi 0x1.62e42fefa39efp-1
@@ -19,25 +20,6 @@
#define C(i) __expm1_poly[i]
-static inline double
-eval_poly (double f, double f2)
-{
- /* Evaluate custom polynomial using Estrin scheme. */
- double p_01 = fma (f, C (1), C (0));
- double p_23 = fma (f, C (3), C (2));
- double p_45 = fma (f, C (5), C (4));
- double p_67 = fma (f, C (7), C (6));
- double p_89 = fma (f, C (9), C (8));
-
- double p_03 = fma (f2, p_23, p_01);
- double p_47 = fma (f2, p_67, p_45);
- double p_8a = fma (f2, C (10), p_89);
-
- double f4 = f2 * f2;
- double p_07 = fma (f4, p_47, p_03);
- return fma (f4 * f4, p_8a, p_07);
-}
-
/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
The maximum observed error is 2.17 ULP:
expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
@@ -80,7 +62,8 @@ expm1 (double x)
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
double f2 = f * f;
- double p = fma (f2, eval_poly (f, f2), f);
+ double f4 = f2 * f2;
+ double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c
index 44981ca..0904652 100644
--- a/pl/math/expm1f_1u6.c
+++ b/pl/math/expm1f_1u6.c
@@ -6,6 +6,7 @@
*/
#include "math_config.h"
+#include "hornerf.h"
#define Shift (0x1.8p23f)
#define InvLn2 (0x1.715476p+0f)
@@ -59,12 +60,7 @@ expm1f (float x)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float p = fmaf (C (4), f, C (3));
- p = fmaf (p, f, C (2));
- p = fmaf (p, f, C (1));
- p = fmaf (p, f, C (0));
- p = fmaf (f * f, p, f);
-
+ float p = fmaf (f * f, HORNER_4 (f, C), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
expm1(x) ~= 2^i * (p + 1) - 1
diff --git a/pl/math/horner.h b/pl/math/horner.h
new file mode 100644
index 0000000..4dbc122
--- /dev/null
+++ b/pl/math/horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
new file mode 100644
index 0000000..892d63b
--- /dev/null
+++ b/pl/math/horner_wrap.h
@@ -0,0 +1,34 @@
+/*
+ * Helper macros for Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define HORNER_1_(x, c, i) FMA(C(i + 1), x, c(i))
+#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
+#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
+#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
+#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i))
+#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i))
+#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i))
+#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i))
+#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i))
+#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i))
+#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i))
+#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i))
+
+#define HORNER_1(x, c) HORNER_1_ (x, c, 0)
+#define HORNER_2(x, c) HORNER_2_ (x, c, 0)
+#define HORNER_3(x, c) HORNER_3_ (x, c, 0)
+#define HORNER_4(x, c) HORNER_4_ (x, c, 0)
+#define HORNER_5(x, c) HORNER_5_ (x, c, 0)
+#define HORNER_6(x, c) HORNER_6_ (x, c, 0)
+#define HORNER_7(x, c) HORNER_7_ (x, c, 0)
+#define HORNER_8(x, c) HORNER_8_ (x, c, 0)
+#define HORNER_9(x, c) HORNER_9_ (x, c, 0)
+#define HORNER_10(x, c) HORNER_10_(x, c, 0)
+#define HORNER_11(x, c) HORNER_11_(x, c, 0)
+#define HORNER_12(x, c) HORNER_12_(x, c, 0)
+// clang-format on
diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h
new file mode 100644
index 0000000..bec1593
--- /dev/null
+++ b/pl/math/hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
index b9c7e9e..ade5d87 100644
--- a/pl/math/log1p_2u.c
+++ b/pl/math/log1p_2u.c
@@ -5,8 +5,7 @@
*/
#include "math_config.h"
-
-#include "log1p_common.h"
+#include "estrin.h"
#define Ln2Hi 0x1.62e42fefa3800p-1
#define Ln2Lo 0x1.ef35793c76730p-45
@@ -19,6 +18,16 @@
#define Rt2MOne 0x3fda827999fcef32
#define AbsMask 0x7fffffffffffffff
#define ExpM63 0x3c00
+#define C(i) __log1p_data.coeffs[i]
+
+static inline double
+eval_poly (double f)
+{
+ double f2 = f * f;
+ double f4 = f2 * f2;
+ double f8 = f4 * f4;
+ return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
/* log1p approximation using polynomial on reduced interval. Largest
observed errors are near the lower boundary of the region where k
diff --git a/pl/math/log1p_common.h b/pl/math/log1p_common.h
deleted file mode 100644
index 24e6f20..0000000
--- a/pl/math/log1p_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Double-precision polynomial evaluation function for scalar and vector
- * log1p(x)
- *
- * Copyright (c) 2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_LOG1P_COMMON_H
-#define PL_MATH_LOG1P_COMMON_H
-
-#include "math_config.h"
-
-#if V_SUPPORTED
-
-#include "v_math.h"
-
-#define DBL_T v_f64_t
-#define FMA v_fma_f64
-#define C(i) v_f64 (__log1p_data.coeffs[i])
-
-#else
-
-#define DBL_T double
-#define FMA fma
-#define C(i) __log1p_data.coeffs[i]
-
-#endif
-
-static inline DBL_T
-eval_poly (DBL_T f)
-{
- /* Evaluate polynomial using Estrin's method. */
- DBL_T p_01 = FMA (f, C (1), C (0));
- DBL_T p_23 = FMA (f, C (3), C (2));
- DBL_T p_45 = FMA (f, C (5), C (4));
- DBL_T p_67 = FMA (f, C (7), C (6));
- DBL_T p_89 = FMA (f, C (9), C (8));
- DBL_T p_ab = FMA (f, C (11), C (10));
- DBL_T p_cd = FMA (f, C (13), C (12));
- DBL_T p_ef = FMA (f, C (15), C (14));
- DBL_T p_gh = FMA (f, C (17), C (16));
-
- DBL_T f2 = f * f;
- DBL_T p_03 = FMA (f2, p_23, p_01);
- DBL_T p_47 = FMA (f2, p_67, p_45);
- DBL_T p_8b = FMA (f2, p_ab, p_89);
- DBL_T p_cf = FMA (f2, p_ef, p_cd);
- DBL_T p_gi = FMA (f2, C (18), p_gh);
-
- DBL_T f4 = f2 * f2;
- DBL_T p_07 = FMA (f4, p_47, p_03);
- DBL_T p_8f = FMA (f4, p_cf, p_8b);
-
- DBL_T f8 = f4 * f4;
- DBL_T p_0f = FMA (f8, p_8f, p_07);
-
- return FMA (f8 * f8, p_gi, p_0f);
-}
-
-#endif // PL_MATH_LOG1P_COMMON_H
diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c
index 5b0d542..9b7cb94 100644
--- a/pl/math/log1pf_2u1.c
+++ b/pl/math/log1pf_2u1.c
@@ -5,6 +5,7 @@
*/
#include "math_config.h"
+#include "hornerf.h"
#define Ln2 (0x1.62e43p-1f)
#define SignMask (0x80000000)
@@ -21,8 +22,8 @@ eval_poly (float m, uint32_t e)
{
#ifdef LOG1PF_2U5
- /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Estrin
- scheme. */
+ /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using
+ slightly modified Estrin scheme (no x^0 term, and x term is just x). */
float p_12 = fmaf (m, C (1), C (0));
float p_34 = fmaf (m, C (3), C (2));
float p_56 = fmaf (m, C (5), C (4));
@@ -49,15 +50,7 @@ eval_poly (float m, uint32_t e)
x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ...
Hence approximation has the form m + m^2 * P(m)
where P(x) = C1 + C2 * x + C3 * x^2 + ... . */
- float p = fmaf (C (8), m, C (7));
- p = fmaf (p, m, C (6));
- p = fmaf (p, m, C (5));
- p = fmaf (p, m, C (4));
- p = fmaf (p, m, C (3));
- p = fmaf (p, m, C (2));
- p = fmaf (p, m, C (1));
- p = fmaf (p, m, C (0));
- return fmaf (m, m * p, m);
+ return fmaf (m, m * HORNER_8 (m, C), m);
#else
#error No log1pf approximation exists with the requested precision. Options are 13 or 25.
diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h
new file mode 100644
index 0000000..bee7592
--- /dev/null
+++ b/pl/math/pairwise_horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h
new file mode 100644
index 0000000..5bc287b
--- /dev/null
+++ b/pl/math/pairwise_horner_wrap.h
@@ -0,0 +1,36 @@
+/*
+ * Helper macros for pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define PW_HORNER_1_(x, c, i) FMA(x, C(i + 1), C(i))
+#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_(x, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0)
+#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0)
+
+#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0)
+#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0)
+#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0)
+// clang-format on
diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h
new file mode 100644
index 0000000..a8aa4d1
--- /dev/null
+++ b/pl/math/pairwise_hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 14abfc6..bd4c4c2 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -789,7 +789,7 @@ cbrtf __vn_cbrtf $runvn fenv
cbrtf _ZGVnN4v_cbrtf $runvn fenv
asinh __s_asinh $runs fenv
# Test vector asinh 3 times, with control lane < 1, > 1 and special.
-# Ensures the v_sel is choosing the right option in all cases.
+# Ensures the v_sel is choosing the right option in all cases.
asinh __v_asinh $runv fenv -c 0.5
asinh __vn_asinh $runvn fenv -c 0.5
asinh _ZGVnN2v_asinh $runvn fenv -c 0.5
diff --git a/pl/math/v_asinh_2u5.c b/pl/math/v_asinh_2u5.c
index 508ced1..974e6df 100644
--- a/pl/math/v_asinh_2u5.c
+++ b/pl/math/v_asinh_2u5.c
@@ -5,6 +5,7 @@
*/
#include "v_math.h"
+#include "estrin.h"
#if V_SUPPORTED
@@ -74,38 +75,6 @@ log_inline (v_f64_t x)
return y;
}
-static inline v_f64_t
-eval_poly (v_f64_t z)
-{
- /* Custom polynomial, shared with scalar routine, for calculating asinh(x) in
- [2^-26, 1]. Evaluated with Estrin scheme. */
- v_f64_t p_01 = v_fma_f64 (z, C (1), C (0));
- v_f64_t p_23 = v_fma_f64 (z, C (3), C (2));
- v_f64_t p_45 = v_fma_f64 (z, C (5), C (4));
- v_f64_t p_67 = v_fma_f64 (z, C (7), C (6));
- v_f64_t p_89 = v_fma_f64 (z, C (9), C (8));
- v_f64_t p_ab = v_fma_f64 (z, C (11), C (10));
- v_f64_t p_cd = v_fma_f64 (z, C (13), C (12));
- v_f64_t p_ef = v_fma_f64 (z, C (15), C (14));
- v_f64_t p_gh = v_fma_f64 (z, C (17), C (16));
-
- v_f64_t z2 = z * z;
- v_f64_t p_03 = v_fma_f64 (z2, p_23, p_01);
- v_f64_t p_47 = v_fma_f64 (z2, p_67, p_45);
- v_f64_t p_8b = v_fma_f64 (z2, p_ab, p_89);
- v_f64_t p_cf = v_fma_f64 (z2, p_ef, p_cd);
-
- v_f64_t z4 = z2 * z2;
- v_f64_t p_07 = v_fma_f64 (z4, p_47, p_03);
- v_f64_t p_8f = v_fma_f64 (z4, p_cf, p_8b);
-
- v_f64_t z8 = z4 * z4;
- v_f64_t p_0f = v_fma_f64 (z8, p_8f, p_07);
-
- v_f64_t z16 = z8 * z8;
- return v_fma_f64 (z16, p_gh, p_0f);
-}
-
/* Double-precision implementation of vector asinh(x).
asinh is very sensitive around 1, so it is impractical to devise a single
low-cost algorithm which is sufficiently accurate on a wide range of input.
@@ -162,7 +131,10 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
#endif
v_f64_t x2 = ax * ax;
- v_f64_t p = eval_poly (x2);
+ v_f64_t z2 = x2 * x2;
+ v_f64_t z4 = z2 * z2;
+ v_f64_t z8 = z4 * z4;
+ v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
option_2 = v_fma_f64 (p, x2 * ax, ax);
#if WANT_ERRNO
option_2 = v_sel_f64 (tiny, x, option_2);
diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c
index 603d8f5..80e11e7 100644
--- a/pl/math/v_erfc_4u.c
+++ b/pl/math/v_erfc_4u.c
@@ -7,6 +7,7 @@
#include "math_config.h"
#include "v_math.h"
+#include "horner.h"
#if V_SUPPORTED
/* Accurate exponential (vector variant of exp_dd). */
@@ -56,28 +57,6 @@ lookup (v_u64_t i)
return e;
}
-/* Evaluate order-12 polynomials using pairwise summation and Horner
- scheme. */
-static inline v_f64_t
-v_eval_poly (v_f64_t z, struct entry e)
-{
- v_f64_t r = e.P[12];
- r = v_fma_f64 (z, r, e.P[11]);
- r = v_fma_f64 (z, r, e.P[10]);
- r = v_fma_f64 (z, r, e.P[9]);
- r = v_fma_f64 (z, r, e.P[8]);
- r = v_fma_f64 (z, r, e.P[7]);
- r = v_fma_f64 (z, r, e.P[6]);
- r = v_fma_f64 (z, r, e.P[5]);
- r = v_fma_f64 (z, r, e.P[4]);
- r = v_fma_f64 (z, r, e.P[3]);
- r = v_fma_f64 (z, r, e.P[2]);
- r = v_fma_f64 (z, r, e.P[1]);
- r = v_fma_f64 (z, r, e.P[0]);
-
- return r;
-}
-
/* Accurate evaluation of exp(x^2) using compensated product
(x^2 ~ x*x + e2) and custom exp(y+d) routine for small
corrections d<<y. */
@@ -154,7 +133,8 @@ v_f64_t V_NAME (erfc) (v_f64_t x)
/* Evaluate Polynomial: P(|x|-x_i). */
z = a - dat.xi;
- p = v_eval_poly (z, dat);
+#define C(i) dat.P[i]
+ p = HORNER_12 (z, C);
/* Evaluate Gaussian: exp(-x^2). */
v_f64_t e = v_eval_gauss (a);
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
index fc9571d..d9c65a5 100644
--- a/pl/math/v_erfcf_1u.c
+++ b/pl/math/v_erfcf_1u.c
@@ -7,6 +7,7 @@
#include "v_math.h"
#include "erfcf.h"
+#include "estrin.h"
#if V_SUPPORTED
@@ -41,39 +42,12 @@ interval_index (uint32_t ia12)
#endif
static inline v_f64_t
-v_eval_poly_estrin (v_f64_t z, const double *coeff1, const double *coeff2)
-{
- v_f64_t z2 = z * z;
- v_f64_t z4 = z2 * z2;
- v_f64_t z8 = z4 * z4;
-
- v_f64_t c0_zc1 = v_fma_f64 (z, C (1), C (0));
- v_f64_t c2_zc3 = v_fma_f64 (z, C (3), C (2));
- v_f64_t c4_zc5 = v_fma_f64 (z, C (5), C (4));
- v_f64_t c6_zc7 = v_fma_f64 (z, C (7), C (6));
- v_f64_t c8_zc9 = v_fma_f64 (z, C (9), C (8));
- v_f64_t c10_zc11 = v_fma_f64 (z, C (11), C (10));
- v_f64_t c12_zc13 = v_fma_f64 (z, C (13), C (12));
- v_f64_t c14_zc15 = v_fma_f64 (z, C (15), C (14));
-
- v_f64_t c0_z2c3 = v_fma_f64 (z2, c2_zc3, c0_zc1);
- v_f64_t c4_z2c7 = v_fma_f64 (z2, c6_zc7, c4_zc5);
- v_f64_t c8_z2c11 = v_fma_f64 (z2, c10_zc11, c8_zc9);
- v_f64_t c12_z2c15 = v_fma_f64 (z2, c14_zc15, c12_zc13);
-
- v_f64_t c0_z4c7 = v_fma_f64 (z4, c4_z2c7, c0_z2c3);
- v_f64_t c8_z4c15 = v_fma_f64 (z4, c12_z2c15, c8_z2c11);
-
- return v_fma_f64 (z8, c8_z4c15, c0_z4c7);
-}
-
-#undef C
-
-static inline v_f64_t
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
const double *coeff2)
{
- v_f64_t poly = v_eval_poly_estrin (x, coeff1, coeff2);
+ v_f64_t x2 = x * x;
+ v_f64_t x4 = x2 * x2;
+ v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
return poly * gauss;
}
@@ -81,7 +55,7 @@ v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
static inline float
approx_poly_gauss (float abs_x, const double *coeff)
{
- return (float) (eval_poly_horner_lvl2 (abs_x, coeff) * eval_exp_mx2 (abs_x));
+ return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
}
static v_f32_t
diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c
index 51d0d51..d97a622 100644
--- a/pl/math/v_log1p_2u5.c
+++ b/pl/math/v_log1p_2u5.c
@@ -7,7 +7,7 @@
#include "v_math.h"
#if V_SUPPORTED
-#include "log1p_common.h"
+#include "estrin.h"
#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
@@ -18,6 +18,16 @@
#define OneTop12 0x3ff
#define BottomMask 0xffffffff
#define AbsMask 0x7fffffffffffffff
+#define C(i) v_f64 (__log1p_data.coeffs[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f)
+{
+ v_f64_t f2 = f * f;
+ v_f64_t f4 = f2 * f2;
+ v_f64_t f8 = f4 * f4;
+ return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
VPCS_ATTR
NOINLINE static v_f64_t
diff --git a/pl/math/v_tanf_3u2.c b/pl/math/v_tanf_3u2.c
index a6d9dd1..01f7f65 100644
--- a/pl/math/v_tanf_3u2.c
+++ b/pl/math/v_tanf_3u2.c
@@ -6,6 +6,8 @@
*/
#include "v_math.h"
+#include "estrinf.h"
+
#if V_SUPPORTED
/* Constants. */
@@ -32,13 +34,7 @@ eval_poly (v_f32_t z)
{
v_f32_t z2 = z * z;
v_f32_t z4 = z2 * z2;
- v_f32_t y_10 = v_fma_f32 (z, poly (1), poly (0));
- v_f32_t y_32 = v_fma_f32 (z, poly (3), poly (2));
- v_f32_t y_54 = v_fma_f32 (z, poly (5), poly (4));
- v_f32_t y_6_54 = v_fma_f32 (z2, poly (6), y_54);
- v_f32_t y_32_10 = v_fma_f32 (z2, y_32, y_10);
- v_f32_t y = v_fma_f32 (z4, y_6_54, y_32_10);
- return y;
+ return ESTRIN_6 (z, z2, z4, poly);
}
/* Fast implementation of Neon tanf.
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 571fd8b..67e4520 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -6,6 +6,7 @@
#include "v_math.h"
#include "mathlib.h"
+#include "estrinf.h"
#if V_SUPPORTED
@@ -39,10 +40,7 @@ expm1f_inline (v_f32_t x)
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */
v_f32_t f2 = f * f;
- v_f32_t p_01 = v_fma_f32 (f, C (1), C (0));
- v_f32_t p_23 = v_fma_f32 (f, C (3), C (2));
- v_f32_t p = v_fma_f32 (f2, p_23, p_01);
- p = v_fma_f32 (f2 * f2, C (4), p);
+ v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
p = v_fma_f32 (f2, p, f);
/* t = 2^i. */