From 77f01ab5d1d2eead1bd4a9135d6a76ebd3fe21e5 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Tue, 2 Jul 2013 14:55:32 +0000 Subject: [PATCH] Implement fma in soft-fp. --- ChangeLog | 51 +++++ ports/ChangeLog.mips | 18 ++ ports/sysdeps/mips/ieee754/s_fma.c | 5 + ports/sysdeps/mips/ieee754/s_fmaf.c | 5 + ports/sysdeps/mips/ieee754/s_fmal.c | 7 + ports/sysdeps/mips/mips32/Implies | 1 + ports/sysdeps/mips/mips64/n32/s_fma.c | 6 - ports/sysdeps/mips/mips64/n64/s_fma.c | 6 - .../sysdeps/mips/mips64/soft-fp/sfp-machine.h | 7 + ports/sysdeps/mips/soft-fp/sfp-machine.h | 7 + soft-fp/double.h | 13 ++ soft-fp/extended.h | 13 ++ soft-fp/fmadf4.c | 56 +++++ soft-fp/fmasf4.c | 51 +++++ soft-fp/fmatf4.c | 49 ++++ soft-fp/op-1.h | 36 ++- soft-fp/op-2.h | 84 ++++--- soft-fp/op-4.h | 128 ++++++----- soft-fp/op-common.h | 211 ++++++++++++++++++ soft-fp/quad.h | 13 ++ soft-fp/single.h | 23 ++ 21 files changed, 681 insertions(+), 109 deletions(-) create mode 100644 ports/sysdeps/mips/ieee754/s_fma.c create mode 100644 ports/sysdeps/mips/ieee754/s_fmaf.c create mode 100644 ports/sysdeps/mips/ieee754/s_fmal.c delete mode 100644 ports/sysdeps/mips/mips64/n32/s_fma.c delete mode 100644 ports/sysdeps/mips/mips64/n64/s_fma.c create mode 100644 soft-fp/fmadf4.c create mode 100644 soft-fp/fmasf4.c create mode 100644 soft-fp/fmatf4.c diff --git a/ChangeLog b/ChangeLog index 406ca28bd2..4a2d19ad03 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,54 @@ +2013-07-02 Joseph Myers + + [BZ #13304] + * soft-fp/op-common.h (_FP_FMA): New macro. + * soft-fp/op-1.h (_FP_FRAC_HIGHBIT_DW_1): New macro. + (_FP_MUL_MEAT_DW_1_imm): Likewise. Split out of ... + (_FP_MUL_MEAT_1_imm): ... here. + (_FP_MUL_MEAT_DW_1_wide): New macro. Split out of ... + (_FP_MUL_MEAT_1_wide): ... here. + (_FP_MUL_MEAT_DW_1_hard): Likewise. Split out of ... + (_FP_MUL_MEAT_1_hard): ... here. + * soft-fp/op-2.h (_FP_FRAC_HIGHBIT_DW_2): New macro. + (_FP_MUL_MEAT_DW_2_wide): Likewise. Split out of ... + (_FP_MUL_MEAT_2_wide): ... here. + (_FP_MUL_MEAT_DW_2_wide_3mul): New macro. Split out of ... + (_FP_MUL_MEAT_2_wide_3mul): ... here. + (_FP_MUL_MEAT_DW_2_gmp): New macro. Split out of ... + (_FP_MUL_MEAT_2_gmp): ... here. + * soft-fp/op-4.h (_FP_FRAC_HIGHBIT_DW_4): New macro. + (_FP_MUL_MEAT_DW_4_wide): Likewise. Split out of ... + (_FP_MUL_MEAT_4_wide): ... here. + (_FP_MUL_MEAT_DW_4_gmp): New macro. Split out of ... + (_FP_MUL_MEAT_4_gmp): ... here. + * soft-fp/single.h (_FP_FRACTBITS_DW_S): New macro. + (_FP_WFRACBITS_DW_S): Likewise. + (_FP_WFRACXBITS_DW_S): Likewise. + (_FP_HIGHBIT_DW_S): Likewise. + (FP_FMA_S): Likewise. + (_FP_FRAC_HIGH_DW_S): Likewise. + * soft-fp/double.h (_FP_FRACTBITS_DW_D): New macro. + (_FP_WFRACBITS_DW_D): Likewise. + (_FP_WFRACXBITS_DW_D): Likewise. + (_FP_HIGHBIT_DW_D): Likewise. + (FP_FMA_D): Likewise. + (_FP_FRAC_HIGH_DW_D): Likewise. + * soft-fp/extended.h (_FP_FRACTBITS_DW_E): New macro. + (_FP_WFRACBITS_DW_E): Likewise. + (_FP_WFRACXBITS_DW_E): Likewise. + (_FP_HIGHBIT_DW_E): Likewise. + (FP_FMA_E): Likewise. + (_FP_FRAC_HIGH_DW_E): Likewise. + * soft-fp/quad.h (_FP_FRACTBITS_DW_Q): New macro. + (_FP_WFRACBITS_DW_Q): Likewise. + (_FP_WFRACXBITS_DW_Q): Likewise. + (_FP_HIGHBIT_DW_Q): Likewise. + (FP_FMA_Q): Likewise. + (_FP_FRAC_HIGH_DW_Q): Likewise. + * soft-fp/fmasf4.c: New file. + * soft-fp/fmadf4.c: Likewise. + * soft-fp/fmatf4.c: Likewise. + 2013-06-28 Liubov Dmitrieva * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set diff --git a/ports/ChangeLog.mips b/ports/ChangeLog.mips index 1f69593a2d..e985de998e 100644 --- a/ports/ChangeLog.mips +++ b/ports/ChangeLog.mips @@ -1,3 +1,21 @@ +2013-07-02 Joseph Myers + + [BZ #13304] + * sysdeps/mips/ieee754/s_fma.c: New file. + * sysdeps/mips/ieee754/s_fmaf.c: Likewise. + * sysdeps/mips/ieee754/s_fmal.c: Likewise. + * sysdeps/mips/mips32/Implies: Add mips/soft-fp. + * sysdeps/mips/mips64/n32/s_fma.c: Remove file. + * sysdeps/mips/mips64/n64/s_fma.c: Likewise. + * sysdeps/mips/mips64/soft-fp/sfp-machine.h (_FP_MUL_MEAT_DW_S): + New macro. + (_FP_MUL_MEAT_DW_D): Likewise. + (_FP_MUL_MEAT_DW_Q): Likewise. + * sysdeps/mips/soft-fp/sfp-machine.h (_FP_MUL_MEAT_DW_S): New + macro. + (_FP_MUL_MEAT_DW_D): Likewise. + (_FP_MUL_MEAT_DW_Q): Likewise. + 2013-06-28 Ryan S. Arnold * sysdeps/mips/dl-procinfo.h (_dl_procinfo): Add TYPE parameter diff --git a/ports/sysdeps/mips/ieee754/s_fma.c b/ports/sysdeps/mips/ieee754/s_fma.c new file mode 100644 index 0000000000..574141416b --- /dev/null +++ b/ports/sysdeps/mips/ieee754/s_fma.c @@ -0,0 +1,5 @@ +#ifdef __mips_hard_float +# include +#else +# include +#endif diff --git a/ports/sysdeps/mips/ieee754/s_fmaf.c b/ports/sysdeps/mips/ieee754/s_fmaf.c new file mode 100644 index 0000000000..30bcdae620 --- /dev/null +++ b/ports/sysdeps/mips/ieee754/s_fmaf.c @@ -0,0 +1,5 @@ +#ifdef __mips_hard_float +# include +#else +# include +#endif diff --git a/ports/sysdeps/mips/ieee754/s_fmal.c b/ports/sysdeps/mips/ieee754/s_fmal.c new file mode 100644 index 0000000000..6b83e914fe --- /dev/null +++ b/ports/sysdeps/mips/ieee754/s_fmal.c @@ -0,0 +1,7 @@ +#include + +#if _MIPS_SIM == _ABIO32 +# error "long double fma being compiled for o32 ABI" +#endif + +#include diff --git a/ports/sysdeps/mips/mips32/Implies b/ports/sysdeps/mips/mips32/Implies index 6473f2517c..42df98f45e 100644 --- a/ports/sysdeps/mips/mips32/Implies +++ b/ports/sysdeps/mips/mips32/Implies @@ -1,3 +1,4 @@ mips/ieee754 +mips/soft-fp mips wordsize-32 diff --git a/ports/sysdeps/mips/mips64/n32/s_fma.c b/ports/sysdeps/mips/mips64/n32/s_fma.c deleted file mode 100644 index 74a1e01fc1..0000000000 --- a/ports/sysdeps/mips/mips64/n32/s_fma.c +++ /dev/null @@ -1,6 +0,0 @@ -/* MIPS long double is implemented in software by fp-bit (as of GCC - 4.7) without support for exceptions or rounding modes, so the fma - implementation in terms of long double is slow and will not produce - correctly rounding results. */ - -#include diff --git a/ports/sysdeps/mips/mips64/n64/s_fma.c b/ports/sysdeps/mips/mips64/n64/s_fma.c deleted file mode 100644 index 74a1e01fc1..0000000000 --- a/ports/sysdeps/mips/mips64/n64/s_fma.c +++ /dev/null @@ -1,6 +0,0 @@ -/* MIPS long double is implemented in software by fp-bit (as of GCC - 4.7) without support for exceptions or rounding modes, so the fma - implementation in terms of long double is slow and will not produce - correctly rounding results. */ - -#include diff --git a/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h b/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h index 1bdde5ace8..9cfd6fbb7b 100644 --- a/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h +++ b/ports/sysdeps/mips/mips64/soft-fp/sfp-machine.h @@ -13,6 +13,13 @@ #define _FP_MUL_MEAT_Q(R,X,Y) \ _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_DW_S(R,X,Y) \ + _FP_MUL_MEAT_DW_1_imm(_FP_WFRACBITS_S,R,X,Y) +#define _FP_MUL_MEAT_DW_D(R,X,Y) \ + _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_DW_Q(R,X,Y) \ + _FP_MUL_MEAT_DW_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) + #define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm) #define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(D,R,X,Y) #define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_2_udiv(Q,R,X,Y) diff --git a/ports/sysdeps/mips/soft-fp/sfp-machine.h b/ports/sysdeps/mips/soft-fp/sfp-machine.h index 8ccfaa60fd..a60bef7665 100644 --- a/ports/sysdeps/mips/soft-fp/sfp-machine.h +++ b/ports/sysdeps/mips/soft-fp/sfp-machine.h @@ -10,6 +10,13 @@ #define _FP_MUL_MEAT_Q(R,X,Y) \ _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_DW_S(R,X,Y) \ + _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_DW_D(R,X,Y) \ + _FP_MUL_MEAT_DW_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_DW_Q(R,X,Y) \ + _FP_MUL_MEAT_DW_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) + #define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(S,R,X,Y) #define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y) #define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y) diff --git a/soft-fp/double.h b/soft-fp/double.h index 759c2eb661..8653f69138 100644 --- a/soft-fp/double.h +++ b/soft-fp/double.h @@ -36,8 +36,10 @@ #if _FP_W_TYPE_SIZE < 64 #define _FP_FRACTBITS_D (2 * _FP_W_TYPE_SIZE) +#define _FP_FRACTBITS_DW_D (4 * _FP_W_TYPE_SIZE) #else #define _FP_FRACTBITS_D _FP_W_TYPE_SIZE +#define _FP_FRACTBITS_DW_D (2 * _FP_W_TYPE_SIZE) #endif #define _FP_FRACBITS_D 53 @@ -59,6 +61,11 @@ #define _FP_OVERFLOW_D \ ((_FP_W_TYPE)1 << _FP_WFRACBITS_D % _FP_W_TYPE_SIZE) +#define _FP_WFRACBITS_DW_D (2 * _FP_WFRACBITS_D) +#define _FP_WFRACXBITS_DW_D (_FP_FRACTBITS_DW_D - _FP_WFRACBITS_DW_D) +#define _FP_HIGHBIT_DW_D \ + ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_D - 1) % _FP_W_TYPE_SIZE) + typedef float DFtype __attribute__((mode(DF))); #if _FP_W_TYPE_SIZE < 64 @@ -149,6 +156,7 @@ union _FP_UNION_D #define FP_DIV_D(R,X,Y) _FP_DIV(D,2,R,X,Y) #define FP_SQRT_D(R,X) _FP_SQRT(D,2,R,X) #define _FP_SQRT_MEAT_D(R,S,T,X,Q) _FP_SQRT_MEAT_2(R,S,T,X,Q) +#define FP_FMA_D(R,X,Y,Z) _FP_FMA(D,2,4,R,X,Y,Z) #define FP_CMP_D(r,X,Y,un) _FP_CMP(D,2,r,X,Y,un) #define FP_CMP_EQ_D(r,X,Y) _FP_CMP_EQ(D,2,r,X,Y) @@ -160,6 +168,8 @@ union _FP_UNION_D #define _FP_FRAC_HIGH_D(X) _FP_FRAC_HIGH_2(X) #define _FP_FRAC_HIGH_RAW_D(X) _FP_FRAC_HIGH_2(X) +#define _FP_FRAC_HIGH_DW_D(X) _FP_FRAC_HIGH_4(X) + #else union _FP_UNION_D @@ -246,6 +256,7 @@ union _FP_UNION_D #define FP_DIV_D(R,X,Y) _FP_DIV(D,1,R,X,Y) #define FP_SQRT_D(R,X) _FP_SQRT(D,1,R,X) #define _FP_SQRT_MEAT_D(R,S,T,X,Q) _FP_SQRT_MEAT_1(R,S,T,X,Q) +#define FP_FMA_D(R,X,Y,Z) _FP_FMA(D,1,2,R,X,Y,Z) /* The implementation of _FP_MUL_D and _FP_DIV_D should be chosen by the target machine. */ @@ -260,4 +271,6 @@ union _FP_UNION_D #define _FP_FRAC_HIGH_D(X) _FP_FRAC_HIGH_1(X) #define _FP_FRAC_HIGH_RAW_D(X) _FP_FRAC_HIGH_1(X) +#define _FP_FRAC_HIGH_DW_D(X) _FP_FRAC_HIGH_2(X) + #endif /* W_TYPE_SIZE < 64 */ diff --git a/soft-fp/extended.h b/soft-fp/extended.h index 74927550eb..c8b1583086 100644 --- a/soft-fp/extended.h +++ b/soft-fp/extended.h @@ -33,8 +33,10 @@ #if _FP_W_TYPE_SIZE < 64 #define _FP_FRACTBITS_E (4*_FP_W_TYPE_SIZE) +#define _FP_FRACTBITS_DW_E (8*_FP_W_TYPE_SIZE) #else #define _FP_FRACTBITS_E (2*_FP_W_TYPE_SIZE) +#define _FP_FRACTBITS_DW_E (4*_FP_W_TYPE_SIZE) #endif #define _FP_FRACBITS_E 64 @@ -56,6 +58,11 @@ #define _FP_OVERFLOW_E \ ((_FP_W_TYPE)1 << (_FP_WFRACBITS_E % _FP_W_TYPE_SIZE)) +#define _FP_WFRACBITS_DW_E (2 * _FP_WFRACBITS_E) +#define _FP_WFRACXBITS_DW_E (_FP_FRACTBITS_DW_E - _FP_WFRACBITS_DW_E) +#define _FP_HIGHBIT_DW_E \ + ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_E - 1) % _FP_W_TYPE_SIZE) + typedef float XFtype __attribute__((mode(XF))); #if _FP_W_TYPE_SIZE < 64 @@ -192,6 +199,7 @@ union _FP_UNION_E #define FP_MUL_E(R,X,Y) _FP_MUL(E,4,R,X,Y) #define FP_DIV_E(R,X,Y) _FP_DIV(E,4,R,X,Y) #define FP_SQRT_E(R,X) _FP_SQRT(E,4,R,X) +#define FP_FMA_E(R,X,Y,Z) _FP_FMA(E,4,8,R,X,Y,Z) /* * Square root algorithms: @@ -258,6 +266,8 @@ union _FP_UNION_E #define _FP_FRAC_HIGH_E(X) (X##_f[2]) #define _FP_FRAC_HIGH_RAW_E(X) (X##_f[1]) +#define _FP_FRAC_HIGH_DW_E(X) (X##_f[4]) + #else /* not _FP_W_TYPE_SIZE < 64 */ union _FP_UNION_E { @@ -383,6 +393,7 @@ union _FP_UNION_E #define FP_MUL_E(R,X,Y) _FP_MUL(E,2,R,X,Y) #define FP_DIV_E(R,X,Y) _FP_DIV(E,2,R,X,Y) #define FP_SQRT_E(R,X) _FP_SQRT(E,2,R,X) +#define FP_FMA_E(R,X,Y,Z) _FP_FMA(E,2,4,R,X,Y,Z) /* * Square root algorithms: @@ -427,4 +438,6 @@ union _FP_UNION_E #define _FP_FRAC_HIGH_E(X) (X##_f1) #define _FP_FRAC_HIGH_RAW_E(X) (X##_f0) +#define _FP_FRAC_HIGH_DW_E(X) (X##_f[2]) + #endif /* not _FP_W_TYPE_SIZE < 64 */ diff --git a/soft-fp/fmadf4.c b/soft-fp/fmadf4.c new file mode 100644 index 0000000000..ebdc2b1d22 --- /dev/null +++ b/soft-fp/fmadf4.c @@ -0,0 +1,56 @@ +/* Implement fma using soft-fp. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file into + combinations with other programs, and to distribute those + combinations without any restriction coming from the use of this + file. (The Lesser General Public License restrictions do apply in + other respects; for example, they cover modification of the file, + and distribution when not linked into a combine executable.) + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "soft-fp.h" +#include "double.h" + +double +__fma (double a, double b, double c) +{ + FP_DECL_EX; + FP_DECL_D(A); FP_DECL_D(B); FP_DECL_D(C); FP_DECL_D(R); + double r; + + FP_INIT_ROUNDMODE; + FP_UNPACK_D(A, a); + FP_UNPACK_D(B, b); + FP_UNPACK_D(C, c); + FP_FMA_D(R, A, B, C); + FP_PACK_D(r, R); + FP_HANDLE_EXCEPTIONS; + + return r; +} +#ifndef __fma +weak_alias (__fma, fma) +#endif + +#ifdef NO_LONG_DOUBLE +strong_alias (__fma, __fmal) +weak_alias (__fmal, fmal) +#endif diff --git a/soft-fp/fmasf4.c b/soft-fp/fmasf4.c new file mode 100644 index 0000000000..e8d60fb191 --- /dev/null +++ b/soft-fp/fmasf4.c @@ -0,0 +1,51 @@ +/* Implement fmaf using soft-fp. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file into + combinations with other programs, and to distribute those + combinations without any restriction coming from the use of this + file. (The Lesser General Public License restrictions do apply in + other respects; for example, they cover modification of the file, + and distribution when not linked into a combine executable.) + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "soft-fp.h" +#include "single.h" + +float +__fmaf (float a, float b, float c) +{ + FP_DECL_EX; + FP_DECL_S(A); FP_DECL_S(B); FP_DECL_S(C); FP_DECL_S(R); + float r; + + FP_INIT_ROUNDMODE; + FP_UNPACK_S(A, a); + FP_UNPACK_S(B, b); + FP_UNPACK_S(C, c); + FP_FMA_S(R, A, B, C); + FP_PACK_S(r, R); + FP_HANDLE_EXCEPTIONS; + + return r; +} +#ifndef __fmaf +weak_alias (__fmaf, fmaf) +#endif diff --git a/soft-fp/fmatf4.c b/soft-fp/fmatf4.c new file mode 100644 index 0000000000..cf489881d7 --- /dev/null +++ b/soft-fp/fmatf4.c @@ -0,0 +1,49 @@ +/* Implement fmal using soft-fp. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file into + combinations with other programs, and to distribute those + combinations without any restriction coming from the use of this + file. (The Lesser General Public License restrictions do apply in + other respects; for example, they cover modification of the file, + and distribution when not linked into a combine executable.) + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "soft-fp.h" +#include "quad.h" + +long double +__fmal (long double a, long double b, long double c) +{ + FP_DECL_EX; + FP_DECL_Q(A); FP_DECL_Q(B); FP_DECL_Q(C); FP_DECL_Q(R); + long double r; + + FP_INIT_ROUNDMODE; + FP_UNPACK_Q(A, a); + FP_UNPACK_Q(B, b); + FP_UNPACK_Q(C, c); + FP_FMA_Q(R, A, B, C); + FP_PACK_Q(r, R); + FP_HANDLE_EXCEPTIONS; + + return r; +} +weak_alias (__fmal, fmal) diff --git a/soft-fp/op-1.h b/soft-fp/op-1.h index 8e05e2fab7..a9ad0d62cd 100644 --- a/soft-fp/op-1.h +++ b/soft-fp/op-1.h @@ -72,6 +72,7 @@ do { \ #define _FP_FRAC_ZEROP_1(X) (X##_f == 0) #define _FP_FRAC_OVERP_1(fs,X) (X##_f & _FP_OVERFLOW_##fs) #define _FP_FRAC_CLEAR_OVERP_1(fs,X) (X##_f &= ~_FP_OVERFLOW_##fs) +#define _FP_FRAC_HIGHBIT_DW_1(fs,X) (X##_f & _FP_HIGHBIT_DW_##fs) #define _FP_FRAC_EQ_1(X, Y) (X##_f == Y##_f) #define _FP_FRAC_GE_1(X, Y) (X##_f >= Y##_f) #define _FP_FRAC_GT_1(X, Y) (X##_f > Y##_f) @@ -137,9 +138,14 @@ do { \ /* Basic. Assuming the host word size is >= 2*FRACBITS, we can do the multiplication immediately. */ -#define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y) \ +#define _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y) \ do { \ R##_f = X##_f * Y##_f; \ + } while (0) + +#define _FP_MUL_MEAT_1_imm(wfracbits, R, X, Y) \ + do { \ + _FP_MUL_MEAT_DW_1_imm(wfracbits, R, X, Y); \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ at either 2B or 2B-1. */ \ @@ -148,10 +154,15 @@ do { \ /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */ +#define _FP_MUL_MEAT_DW_1_wide(wfracbits, R, X, Y, doit) \ + do { \ + doit(R##_f1, R##_f0, X##_f, Y##_f); \ + } while (0) + #define _FP_MUL_MEAT_1_wide(wfracbits, R, X, Y, doit) \ do { \ - _FP_W_TYPE _Z_f0, _Z_f1; \ - doit(_Z_f1, _Z_f0, X##_f, Y##_f); \ + _FP_FRAC_DECL_2(_Z); \ + _FP_MUL_MEAT_DW_1_wide(wfracbits, _Z, X, Y, doit); \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ at either 2B or 2B-1. */ \ @@ -161,9 +172,10 @@ do { \ /* Finally, a simple widening multiply algorithm. What fun! */ -#define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y) \ +#define _FP_MUL_MEAT_DW_1_hard(wfracbits, R, X, Y) \ do { \ - _FP_W_TYPE _xh, _xl, _yh, _yl, _z_f0, _z_f1, _a_f0, _a_f1; \ + _FP_W_TYPE _xh, _xl, _yh, _yl; \ + _FP_FRAC_DECL_2(_a); \ \ /* split the words in half */ \ _xh = X##_f >> (_FP_W_TYPE_SIZE/2); \ @@ -172,17 +184,23 @@ do { \ _yl = Y##_f & (((_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2)) - 1); \ \ /* multiply the pieces */ \ - _z_f0 = _xl * _yl; \ + R##_f0 = _xl * _yl; \ _a_f0 = _xh * _yl; \ _a_f1 = _xl * _yh; \ - _z_f1 = _xh * _yh; \ + R##_f1 = _xh * _yh; \ \ /* reassemble into two full words */ \ if ((_a_f0 += _a_f1) < _a_f1) \ - _z_f1 += (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2); \ + R##_f1 += (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE/2); \ _a_f1 = _a_f0 >> (_FP_W_TYPE_SIZE/2); \ _a_f0 = _a_f0 << (_FP_W_TYPE_SIZE/2); \ - _FP_FRAC_ADD_2(_z, _z, _a); \ + _FP_FRAC_ADD_2(R, R, _a); \ + } while (0) + +#define _FP_MUL_MEAT_1_hard(wfracbits, R, X, Y) \ + do { \ + _FP_FRAC_DECL_2(_z); \ + _FP_MUL_MEAT_DW_1_hard(wfracbits, _z, X, Y); \ \ /* normalize */ \ _FP_FRAC_SRS_2(_z, wfracbits - 1, 2*wfracbits); \ diff --git a/soft-fp/op-2.h b/soft-fp/op-2.h index 48e01d26dc..20088227e1 100644 --- a/soft-fp/op-2.h +++ b/soft-fp/op-2.h @@ -134,6 +134,8 @@ #define _FP_FRAC_ZEROP_2(X) ((X##_f1 | X##_f0) == 0) #define _FP_FRAC_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs) #define _FP_FRAC_CLEAR_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs) +#define _FP_FRAC_HIGHBIT_DW_2(fs,X) \ + (_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs) #define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0) #define _FP_FRAC_GT_2(X, Y) \ (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0)) @@ -257,23 +259,30 @@ /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */ -#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \ +#define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit) \ do { \ - _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ + _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ \ - doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \ + doit(_FP_FRAC_WORD_4(R,1), _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0); \ doit(_b_f1, _b_f0, X##_f0, Y##_f1); \ doit(_c_f1, _c_f0, X##_f1, Y##_f0); \ - doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1); \ + doit(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), X##_f1, Y##_f1); \ \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0, \ - _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0, \ - _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1), 0, _b_f1, _b_f0, \ + _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0, \ + _FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1)); \ + } while (0) + +#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \ + do { \ + _FP_FRAC_DECL_4(_z); \ + \ + _FP_MUL_MEAT_DW_2_wide(wfracbits, _z, X, Y, doit); \ \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ @@ -287,9 +296,9 @@ Do only 3 multiplications instead of four. This one is for machines where multiplication is much more expensive than subtraction. */ -#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \ +#define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit) \ do { \ - _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ + _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ _FP_W_TYPE _d; \ int _c1, _c2; \ \ @@ -297,27 +306,34 @@ _c1 = _b_f0 < X##_f0; \ _b_f1 = Y##_f0 + Y##_f1; \ _c2 = _b_f1 < Y##_f0; \ - doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \ - doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1); \ + doit(_d, _FP_FRAC_WORD_4(R,0), X##_f0, Y##_f0); \ + doit(_FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1), _b_f0, _b_f1); \ doit(_c_f1, _c_f0, X##_f1, Y##_f1); \ \ _b_f0 &= -_c2; \ _b_f1 &= -_c1; \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d, \ - 0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1)); \ - __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1), (_c1 & _c2), 0, _d, \ + 0, _FP_FRAC_WORD_4(R,2), _FP_FRAC_WORD_4(R,1)); \ + __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ _b_f0); \ - __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ + __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ _b_f1); \ - __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1), \ - 0, _d, _FP_FRAC_WORD_4(_z,0)); \ - __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ - _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0); \ - __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), \ + __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1), \ + 0, _d, _FP_FRAC_WORD_4(R,0)); \ + __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(R,3),_FP_FRAC_WORD_4(R,2), \ + _FP_FRAC_WORD_4(R,1), 0, _c_f1, _c_f0); \ + __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2), \ _c_f1, _c_f0, \ - _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2)); \ + _FP_FRAC_WORD_4(R,3), _FP_FRAC_WORD_4(R,2)); \ + } while (0) + +#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \ + do { \ + _FP_FRAC_DECL_4(_z); \ + \ + _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, _z, X, Y, doit); \ \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ @@ -327,14 +343,20 @@ R##_f1 = _FP_FRAC_WORD_4(_z,1); \ } while (0) -#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \ +#define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y) \ do { \ - _FP_FRAC_DECL_4(_z); \ _FP_W_TYPE _x[2], _y[2]; \ _x[0] = X##_f0; _x[1] = X##_f1; \ _y[0] = Y##_f0; _y[1] = Y##_f1; \ \ - mpn_mul_n(_z_f, _x, _y, 2); \ + mpn_mul_n(R##_f, _x, _y, 2); \ + } while (0) + +#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \ + do { \ + _FP_FRAC_DECL_4(_z); \ + \ + _FP_MUL_MEAT_DW_2_gmp(wfracbits, _z, X, Y); \ \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ diff --git a/soft-fp/op-4.h b/soft-fp/op-4.h index fd31da90f8..f16870d0f7 100644 --- a/soft-fp/op-4.h +++ b/soft-fp/op-4.h @@ -142,6 +142,8 @@ #define _FP_FRAC_ZEROP_4(X) ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]) == 0) #define _FP_FRAC_NEGP_4(X) ((_FP_WS_TYPE)X##_f[3] < 0) #define _FP_FRAC_OVERP_4(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs) +#define _FP_FRAC_HIGHBIT_DW_4(fs,X) \ + (_FP_FRAC_HIGH_DW_##fs(X) & _FP_HIGHBIT_DW_##fs) #define _FP_FRAC_CLEAR_OVERP_4(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs) #define _FP_FRAC_EQ_4(X,Y) \ @@ -246,81 +248,88 @@ /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */ -#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit) \ +#define _FP_MUL_MEAT_DW_4_wide(wfracbits, R, X, Y, doit) \ do { \ - _FP_FRAC_DECL_8(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ + _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ _FP_FRAC_DECL_2(_d); _FP_FRAC_DECL_2(_e); _FP_FRAC_DECL_2(_f); \ \ - doit(_FP_FRAC_WORD_8(_z,1), _FP_FRAC_WORD_8(_z,0), X##_f[0], Y##_f[0]); \ + doit(_FP_FRAC_WORD_8(R,1), _FP_FRAC_WORD_8(R,0), X##_f[0], Y##_f[0]); \ doit(_b_f1, _b_f0, X##_f[0], Y##_f[1]); \ doit(_c_f1, _c_f0, X##_f[1], Y##_f[0]); \ doit(_d_f1, _d_f0, X##_f[1], Y##_f[1]); \ doit(_e_f1, _e_f0, X##_f[0], Y##_f[2]); \ doit(_f_f1, _f_f0, X##_f[2], Y##_f[0]); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2), \ - _FP_FRAC_WORD_8(_z,1), 0,_b_f1,_b_f0, \ - 0,0,_FP_FRAC_WORD_8(_z,1)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2), \ - _FP_FRAC_WORD_8(_z,1), 0,_c_f1,_c_f0, \ - _FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2), \ - _FP_FRAC_WORD_8(_z,1)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3), \ - _FP_FRAC_WORD_8(_z,2), 0,_d_f1,_d_f0, \ - 0,_FP_FRAC_WORD_8(_z,3),_FP_FRAC_WORD_8(_z,2)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3), \ - _FP_FRAC_WORD_8(_z,2), 0,_e_f1,_e_f0, \ - _FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3), \ - _FP_FRAC_WORD_8(_z,2)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3), \ - _FP_FRAC_WORD_8(_z,2), 0,_f_f1,_f_f0, \ - _FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3), \ - _FP_FRAC_WORD_8(_z,2)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2), \ + _FP_FRAC_WORD_8(R,1), 0,_b_f1,_b_f0, \ + 0,0,_FP_FRAC_WORD_8(R,1)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2), \ + _FP_FRAC_WORD_8(R,1), 0,_c_f1,_c_f0, \ + _FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2), \ + _FP_FRAC_WORD_8(R,1)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3), \ + _FP_FRAC_WORD_8(R,2), 0,_d_f1,_d_f0, \ + 0,_FP_FRAC_WORD_8(R,3),_FP_FRAC_WORD_8(R,2)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3), \ + _FP_FRAC_WORD_8(R,2), 0,_e_f1,_e_f0, \ + _FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3), \ + _FP_FRAC_WORD_8(R,2)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3), \ + _FP_FRAC_WORD_8(R,2), 0,_f_f1,_f_f0, \ + _FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3), \ + _FP_FRAC_WORD_8(R,2)); \ doit(_b_f1, _b_f0, X##_f[0], Y##_f[3]); \ doit(_c_f1, _c_f0, X##_f[3], Y##_f[0]); \ doit(_d_f1, _d_f0, X##_f[1], Y##_f[2]); \ doit(_e_f1, _e_f0, X##_f[2], Y##_f[1]); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3), 0,_b_f1,_b_f0, \ - 0,_FP_FRAC_WORD_8(_z,4),_FP_FRAC_WORD_8(_z,3)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3), 0,_c_f1,_c_f0, \ - _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3), 0,_d_f1,_d_f0, \ - _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3), 0,_e_f1,_e_f0, \ - _FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4), \ - _FP_FRAC_WORD_8(_z,3)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3), 0,_b_f1,_b_f0, \ + 0,_FP_FRAC_WORD_8(R,4),_FP_FRAC_WORD_8(R,3)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3), 0,_c_f1,_c_f0, \ + _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3), 0,_d_f1,_d_f0, \ + _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3), 0,_e_f1,_e_f0, \ + _FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4), \ + _FP_FRAC_WORD_8(R,3)); \ doit(_b_f1, _b_f0, X##_f[2], Y##_f[2]); \ doit(_c_f1, _c_f0, X##_f[1], Y##_f[3]); \ doit(_d_f1, _d_f0, X##_f[3], Y##_f[1]); \ doit(_e_f1, _e_f0, X##_f[2], Y##_f[3]); \ doit(_f_f1, _f_f0, X##_f[3], Y##_f[2]); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5), \ - _FP_FRAC_WORD_8(_z,4), 0,_b_f1,_b_f0, \ - 0,_FP_FRAC_WORD_8(_z,5),_FP_FRAC_WORD_8(_z,4)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5), \ - _FP_FRAC_WORD_8(_z,4), 0,_c_f1,_c_f0, \ - _FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5), \ - _FP_FRAC_WORD_8(_z,4)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5), \ - _FP_FRAC_WORD_8(_z,4), 0,_d_f1,_d_f0, \ - _FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5), \ - _FP_FRAC_WORD_8(_z,4)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6), \ - _FP_FRAC_WORD_8(_z,5), 0,_e_f1,_e_f0, \ - 0,_FP_FRAC_WORD_8(_z,6),_FP_FRAC_WORD_8(_z,5)); \ - __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6), \ - _FP_FRAC_WORD_8(_z,5), 0,_f_f1,_f_f0, \ - _FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6), \ - _FP_FRAC_WORD_8(_z,5)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5), \ + _FP_FRAC_WORD_8(R,4), 0,_b_f1,_b_f0, \ + 0,_FP_FRAC_WORD_8(R,5),_FP_FRAC_WORD_8(R,4)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5), \ + _FP_FRAC_WORD_8(R,4), 0,_c_f1,_c_f0, \ + _FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5), \ + _FP_FRAC_WORD_8(R,4)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5), \ + _FP_FRAC_WORD_8(R,4), 0,_d_f1,_d_f0, \ + _FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5), \ + _FP_FRAC_WORD_8(R,4)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6), \ + _FP_FRAC_WORD_8(R,5), 0,_e_f1,_e_f0, \ + 0,_FP_FRAC_WORD_8(R,6),_FP_FRAC_WORD_8(R,5)); \ + __FP_FRAC_ADD_3(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6), \ + _FP_FRAC_WORD_8(R,5), 0,_f_f1,_f_f0, \ + _FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6), \ + _FP_FRAC_WORD_8(R,5)); \ doit(_b_f1, _b_f0, X##_f[3], Y##_f[3]); \ - __FP_FRAC_ADD_2(_FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6), \ + __FP_FRAC_ADD_2(_FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6), \ _b_f1,_b_f0, \ - _FP_FRAC_WORD_8(_z,7),_FP_FRAC_WORD_8(_z,6)); \ + _FP_FRAC_WORD_8(R,7),_FP_FRAC_WORD_8(R,6)); \ + } while (0) + +#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit) \ + do { \ + _FP_FRAC_DECL_8(_z); \ + \ + _FP_MUL_MEAT_DW_4_wide(wfracbits, _z, X, Y, doit); \ \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ @@ -330,11 +339,16 @@ _FP_FRAC_WORD_8(_z,1), _FP_FRAC_WORD_8(_z,0)); \ } while (0) +#define _FP_MUL_MEAT_DW_4_gmp(wfracbits, R, X, Y) \ + do { \ + mpn_mul_n(R##_f, _x_f, _y_f, 4); \ + } while (0) + #define _FP_MUL_MEAT_4_gmp(wfracbits, R, X, Y) \ do { \ _FP_FRAC_DECL_8(_z); \ \ - mpn_mul_n(_z_f, _x_f, _y_f, 4); \ + _FP_MUL_MEAT_DW_4_gmp(wfracbits, _z, X, Y); \ \ /* Normalize since we know where the msb of the multiplicands \ were (bit B), we know that the msb of the of the product is \ diff --git a/soft-fp/op-common.h b/soft-fp/op-common.h index c4acb99161..bed1e21fd4 100644 --- a/soft-fp/op-common.h +++ b/soft-fp/op-common.h @@ -847,6 +847,217 @@ do { \ } while (0) +/* Fused multiply-add. The input values should be cooked. */ + +#define _FP_FMA(fs, wc, dwc, R, X, Y, Z) \ +do { \ + FP_DECL_##fs(T); \ + T##_s = X##_s ^ Y##_s; \ + T##_e = X##_e + Y##_e + 1; \ + switch (_FP_CLS_COMBINE(X##_c, Y##_c)) \ + { \ + case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_NORMAL): \ + switch (Z##_c) \ + { \ + case FP_CLS_INF: \ + case FP_CLS_NAN: \ + R##_s = Z##_s; \ + _FP_FRAC_COPY_##wc(R, Z); \ + R##_c = Z##_c; \ + break; \ + \ + case FP_CLS_ZERO: \ + R##_c = FP_CLS_NORMAL; \ + R##_s = T##_s; \ + R##_e = T##_e; \ + \ + _FP_MUL_MEAT_##fs(R, X, Y); \ + \ + if (_FP_FRAC_OVERP_##wc(fs, R)) \ + _FP_FRAC_SRS_##wc(R, 1, _FP_WFRACBITS_##fs); \ + else \ + R##_e--; \ + break; \ + \ + case FP_CLS_NORMAL:; \ + _FP_FRAC_DECL_##dwc(TD); \ + _FP_FRAC_DECL_##dwc(ZD); \ + _FP_FRAC_DECL_##dwc(RD); \ + _FP_MUL_MEAT_DW_##fs(TD, X, Y); \ + R##_e = T##_e; \ + int tsh = _FP_FRAC_HIGHBIT_DW_##dwc(fs, TD) == 0; \ + T##_e -= tsh; \ + int ediff = T##_e - Z##_e; \ + if (ediff >= 0) \ + { \ + int shift = _FP_WFRACBITS_##fs - tsh - ediff; \ + if (shift <= -_FP_WFRACBITS_##fs) \ + _FP_FRAC_SET_##dwc(ZD, _FP_MINFRAC_##dwc); \ + else \ + { \ + _FP_FRAC_COPY_##dwc##_##wc(ZD, Z); \ + if (shift < 0) \ + _FP_FRAC_SRS_##dwc(ZD, -shift, \ + _FP_WFRACBITS_DW_##fs); \ + else if (shift > 0) \ + _FP_FRAC_SLL_##dwc(ZD, shift); \ + } \ + R##_s = T##_s; \ + if (T##_s == Z##_s) \ + _FP_FRAC_ADD_##dwc(RD, TD, ZD); \ + else \ + { \ + _FP_FRAC_SUB_##dwc(RD, TD, ZD); \ + if (_FP_FRAC_NEGP_##dwc(RD)) \ + { \ + R##_s = Z##_s; \ + _FP_FRAC_SUB_##dwc(RD, ZD, TD); \ + } \ + } \ + } \ + else \ + { \ + R##_e = Z##_e; \ + R##_s = Z##_s; \ + _FP_FRAC_COPY_##dwc##_##wc(ZD, Z); \ + _FP_FRAC_SLL_##dwc(ZD, _FP_WFRACBITS_##fs); \ + int shift = -ediff - tsh; \ + if (shift >= _FP_WFRACBITS_DW_##fs) \ + _FP_FRAC_SET_##dwc(TD, _FP_MINFRAC_##dwc); \ + else if (shift > 0) \ + _FP_FRAC_SRS_##dwc(TD, shift, \ + _FP_WFRACBITS_DW_##fs); \ + if (Z##_s == T##_s) \ + _FP_FRAC_ADD_##dwc(RD, ZD, TD); \ + else \ + _FP_FRAC_SUB_##dwc(RD, ZD, TD); \ + } \ + if (_FP_FRAC_ZEROP_##dwc(RD)) \ + { \ + if (T##_s == Z##_s) \ + R##_s = Z##_s; \ + else \ + R##_s = (FP_ROUNDMODE == FP_RND_MINF); \ + _FP_FRAC_SET_##wc(R, _FP_ZEROFRAC_##wc); \ + R##_c = FP_CLS_ZERO; \ + } \ + else \ + { \ + int rlz; \ + _FP_FRAC_CLZ_##dwc(rlz, RD); \ + rlz -= _FP_WFRACXBITS_DW_##fs; \ + R##_e -= rlz; \ + int shift = _FP_WFRACBITS_##fs - rlz; \ + if (shift > 0) \ + _FP_FRAC_SRS_##dwc(RD, shift, \ + _FP_WFRACBITS_DW_##fs); \ + else if (shift < 0) \ + _FP_FRAC_SLL_##dwc(RD, -shift); \ + _FP_FRAC_COPY_##wc##_##dwc(R, RD); \ + R##_c = FP_CLS_NORMAL; \ + } \ + break; \ + } \ + goto done_fma; \ + \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NAN): \ + _FP_CHOOSENAN(fs, wc, T, X, Y, '*'); \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_INF): \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_ZERO): \ + T##_s = X##_s; \ + \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_INF): \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_ZERO): \ + _FP_FRAC_COPY_##wc(T, X); \ + T##_c = X##_c; \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_NAN): \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NAN): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NAN): \ + T##_s = Y##_s; \ + \ + case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_INF): \ + case _FP_CLS_COMBINE(FP_CLS_NORMAL,FP_CLS_ZERO): \ + _FP_FRAC_COPY_##wc(T, Y); \ + T##_c = Y##_c; \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_INF): \ + T##_s = _FP_NANSIGN_##fs; \ + T##_c = FP_CLS_NAN; \ + _FP_FRAC_SET_##wc(T, _FP_NANFRAC_##fs); \ + FP_SET_EXCEPTION(FP_EX_INVALID); \ + break; \ + \ + default: \ + abort(); \ + } \ + \ + /* T = X * Y is zero, infinity or NaN. */ \ + switch (_FP_CLS_COMBINE(T##_c, Z##_c)) \ + { \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NAN): \ + _FP_CHOOSENAN(fs, wc, R, T, Z, '+'); \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_INF): \ + case _FP_CLS_COMBINE(FP_CLS_NAN,FP_CLS_ZERO): \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO): \ + R##_s = T##_s; \ + _FP_FRAC_COPY_##wc(R, T); \ + R##_c = T##_c; \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NAN): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NAN): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_NORMAL): \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_INF): \ + R##_s = Z##_s; \ + _FP_FRAC_COPY_##wc(R, Z); \ + R##_c = Z##_c; \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_INF): \ + if (T##_s == Z##_s) \ + { \ + R##_s = Z##_s; \ + _FP_FRAC_COPY_##wc(R, Z); \ + R##_c = Z##_c; \ + } \ + else \ + { \ + R##_s = _FP_NANSIGN_##fs; \ + R##_c = FP_CLS_NAN; \ + _FP_FRAC_SET_##wc(R, _FP_NANFRAC_##fs); \ + FP_SET_EXCEPTION(FP_EX_INVALID); \ + } \ + break; \ + \ + case _FP_CLS_COMBINE(FP_CLS_ZERO,FP_CLS_ZERO): \ + if (T##_s == Z##_s) \ + R##_s = Z##_s; \ + else \ + R##_s = (FP_ROUNDMODE == FP_RND_MINF); \ + _FP_FRAC_COPY_##wc(R, Z); \ + R##_c = Z##_c; \ + break; \ + \ + default: \ + abort(); \ + } \ + done_fma: ; \ +} while (0) + + /* * Main division routine. The input values should be cooked. */ diff --git a/soft-fp/quad.h b/soft-fp/quad.h index f0aa07e74f..9a16bf3284 100644 --- a/soft-fp/quad.h +++ b/soft-fp/quad.h @@ -36,8 +36,10 @@ #if _FP_W_TYPE_SIZE < 64 #define _FP_FRACTBITS_Q (4*_FP_W_TYPE_SIZE) +#define _FP_FRACTBITS_DW_Q (8*_FP_W_TYPE_SIZE) #else #define _FP_FRACTBITS_Q (2*_FP_W_TYPE_SIZE) +#define _FP_FRACTBITS_DW_Q (4*_FP_W_TYPE_SIZE) #endif #define _FP_FRACBITS_Q 113 @@ -59,6 +61,11 @@ #define _FP_OVERFLOW_Q \ ((_FP_W_TYPE)1 << (_FP_WFRACBITS_Q % _FP_W_TYPE_SIZE)) +#define _FP_WFRACBITS_DW_Q (2 * _FP_WFRACBITS_Q) +#define _FP_WFRACXBITS_DW_Q (_FP_FRACTBITS_DW_Q - _FP_WFRACBITS_DW_Q) +#define _FP_HIGHBIT_DW_Q \ + ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_Q - 1) % _FP_W_TYPE_SIZE) + typedef float TFtype __attribute__((mode(TF))); #if _FP_W_TYPE_SIZE < 64 @@ -155,6 +162,7 @@ union _FP_UNION_Q #define FP_DIV_Q(R,X,Y) _FP_DIV(Q,4,R,X,Y) #define FP_SQRT_Q(R,X) _FP_SQRT(Q,4,R,X) #define _FP_SQRT_MEAT_Q(R,S,T,X,Q) _FP_SQRT_MEAT_4(R,S,T,X,Q) +#define FP_FMA_Q(R,X,Y,Z) _FP_FMA(Q,4,8,R,X,Y,Z) #define FP_CMP_Q(r,X,Y,un) _FP_CMP(Q,4,r,X,Y,un) #define FP_CMP_EQ_Q(r,X,Y) _FP_CMP_EQ(Q,4,r,X,Y) @@ -166,6 +174,8 @@ union _FP_UNION_Q #define _FP_FRAC_HIGH_Q(X) _FP_FRAC_HIGH_4(X) #define _FP_FRAC_HIGH_RAW_Q(X) _FP_FRAC_HIGH_4(X) +#define _FP_FRAC_HIGH_DW_Q(X) _FP_FRAC_HIGH_8(X) + #else /* not _FP_W_TYPE_SIZE < 64 */ union _FP_UNION_Q { @@ -256,6 +266,7 @@ union _FP_UNION_Q #define FP_DIV_Q(R,X,Y) _FP_DIV(Q,2,R,X,Y) #define FP_SQRT_Q(R,X) _FP_SQRT(Q,2,R,X) #define _FP_SQRT_MEAT_Q(R,S,T,X,Q) _FP_SQRT_MEAT_2(R,S,T,X,Q) +#define FP_FMA_Q(R,X,Y,Z) _FP_FMA(Q,2,4,R,X,Y,Z) #define FP_CMP_Q(r,X,Y,un) _FP_CMP(Q,2,r,X,Y,un) #define FP_CMP_EQ_Q(r,X,Y) _FP_CMP_EQ(Q,2,r,X,Y) @@ -267,4 +278,6 @@ union _FP_UNION_Q #define _FP_FRAC_HIGH_Q(X) _FP_FRAC_HIGH_2(X) #define _FP_FRAC_HIGH_RAW_Q(X) _FP_FRAC_HIGH_2(X) +#define _FP_FRAC_HIGH_DW_Q(X) _FP_FRAC_HIGH_4(X) + #endif /* not _FP_W_TYPE_SIZE < 64 */ diff --git a/soft-fp/single.h b/soft-fp/single.h index dec0031e9a..c94f31f99b 100644 --- a/soft-fp/single.h +++ b/soft-fp/single.h @@ -36,6 +36,12 @@ #define _FP_FRACTBITS_S _FP_W_TYPE_SIZE +#if _FP_W_TYPE_SIZE < 64 +# define _FP_FRACTBITS_DW_S (2 * _FP_W_TYPE_SIZE) +#else +# define _FP_FRACTBITS_DW_S _FP_W_TYPE_SIZE +#endif + #define _FP_FRACBITS_S 24 #define _FP_FRACXBITS_S (_FP_FRACTBITS_S - _FP_FRACBITS_S) #define _FP_WFRACBITS_S (_FP_WORKBITS + _FP_FRACBITS_S) @@ -49,6 +55,11 @@ #define _FP_IMPLBIT_SH_S ((_FP_W_TYPE)1 << (_FP_FRACBITS_S-1+_FP_WORKBITS)) #define _FP_OVERFLOW_S ((_FP_W_TYPE)1 << (_FP_WFRACBITS_S)) +#define _FP_WFRACBITS_DW_S (2 * _FP_WFRACBITS_S) +#define _FP_WFRACXBITS_DW_S (_FP_FRACTBITS_DW_S - _FP_WFRACBITS_DW_S) +#define _FP_HIGHBIT_DW_S \ + ((_FP_W_TYPE)1 << (_FP_WFRACBITS_DW_S - 1) % _FP_W_TYPE_SIZE) + /* The implementation of _FP_MUL_MEAT_S and _FP_DIV_MEAT_S should be chosen by the target machine. */ @@ -139,6 +150,12 @@ union _FP_UNION_S #define FP_SQRT_S(R,X) _FP_SQRT(S,1,R,X) #define _FP_SQRT_MEAT_S(R,S,T,X,Q) _FP_SQRT_MEAT_1(R,S,T,X,Q) +#if _FP_W_TYPE_SIZE < 64 +# define FP_FMA_S(R, X, Y, Z) _FP_FMA(S, 1, 2, R, X, Y, Z) +#else +# define FP_FMA_S(R, X, Y, Z) _FP_FMA(S, 1, 1, R, X, Y, Z) +#endif + #define FP_CMP_S(r,X,Y,un) _FP_CMP(S,1,r,X,Y,un) #define FP_CMP_EQ_S(r,X,Y) _FP_CMP_EQ(S,1,r,X,Y) #define FP_CMP_UNORD_S(r,X,Y) _FP_CMP_UNORD(S,1,r,X,Y) @@ -148,3 +165,9 @@ union _FP_UNION_S #define _FP_FRAC_HIGH_S(X) _FP_FRAC_HIGH_1(X) #define _FP_FRAC_HIGH_RAW_S(X) _FP_FRAC_HIGH_1(X) + +#if _FP_W_TYPE_SIZE < 64 +# define _FP_FRAC_HIGH_DW_S(X) _FP_FRAC_HIGH_2(X) +#else +# define _FP_FRAC_HIGH_DW_S(X) _FP_FRAC_HIGH_1(X) +#endif