Diffstat (limited to 'src/math/x86_64')
33 files changed, 491 insertions, 0 deletions
diff --git a/src/math/x86_64/__invtrigl.s b/src/math/x86_64/__invtrigl.s
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/math/x86_64/__invtrigl.s
diff --git a/src/math/x86_64/acosl.s b/src/math/x86_64/acosl.s
new file mode 100644
index 0000000..88e01b4
--- /dev/null
+++ b/src/math/x86_64/acosl.s
@@ -0,0 +1,16 @@
+# see ../i386/acos.s
+
+.global acosl
+.type acosl,@function
+acosl:
+	fldt 8(%rsp)
+1:	fld %st(0)
+	fld1
+	fsub %st(0),%st(1)
+	fadd %st(2)
+	fmulp
+	fsqrt
+	fabs
+	fxch %st(1)
+	fpatan
+	ret
diff --git a/src/math/x86_64/asinl.s b/src/math/x86_64/asinl.s
new file mode 100644
index 0000000..ed212d9
--- /dev/null
+++ b/src/math/x86_64/asinl.s
@@ -0,0 +1,12 @@
+.global asinl
+.type asinl,@function
+asinl:
+	fldt 8(%rsp)
+1:	fld %st(0)
+	fld1
+	fsub %st(0),%st(1)
+	fadd %st(2)
+	fmulp
+	fsqrt
+	fpatan
+	ret
diff --git a/src/math/x86_64/atan2l.s b/src/math/x86_64/atan2l.s
new file mode 100644
index 0000000..e5f0a3d
--- /dev/null
+++ b/src/math/x86_64/atan2l.s
@@ -0,0 +1,7 @@
+.global atan2l
+.type atan2l,@function
+atan2l:
+	fldt 8(%rsp)
+	fldt 24(%rsp)
+	fpatan
+	ret
diff --git a/src/math/x86_64/atanl.s b/src/math/x86_64/atanl.s
new file mode 100644
index 0000000..df76de5
--- /dev/null
+++ b/src/math/x86_64/atanl.s
@@ -0,0 +1,7 @@
+.global atanl
+.type atanl,@function
+atanl:
+	fldt 8(%rsp)
+	fld1
+	fpatan
+	ret
diff --git a/src/math/x86_64/ceill.s b/src/math/x86_64/ceill.s
new file mode 100644
index 0000000..f5cfa3b
--- /dev/null
+++ b/src/math/x86_64/ceill.s
@@ -0,0 +1 @@
+# see floorl.s
diff --git a/src/math/x86_64/exp2l.s b/src/math/x86_64/exp2l.s
new file mode 100644
index 0000000..effab2b
--- /dev/null
+++ b/src/math/x86_64/exp2l.s
@@ -0,0 +1,83 @@
+.global expm1l
+.type expm1l,@function
+expm1l:
+	fldt 8(%rsp)
+	fldl2e
+	fmulp
+	movl $0xc2820000,-4(%rsp)
+	flds -4(%rsp)
+	fucomip %st(1),%st
+	fld1
+	jb 1f
+	# x*log2e <= -65, return -1 without underflow
+	fstp %st(1)
+	fchs
+	ret
+1:	fld %st(1)
+	fabs
+	fucomip %st(1),%st
+	fstp %st(0)
+	ja 1f
+	f2xm1
+	ret
+1:	push %rax
+	call 1f
+	pop %rax
+	fld1
+	fsubrp
+	ret
+
+.global exp2l
+.type exp2l,@function
+exp2l:
+	fldt 8(%rsp)
+1:	fld %st(0)
+	sub $16,%rsp
+	fstpt (%rsp)
+	mov 8(%rsp),%ax
+	and $0x7fff,%ax
+	cmp $0x3fff+13,%ax
+	jb 4f		# |x| < 8192
+	cmp $0x3fff+15,%ax
+	jae 3f		# |x| >= 32768
+	fsts (%rsp)
+	cmpl $0xc67ff800,(%rsp)
+	jb 2f		# x > -16382
+	movl $0x5f000000,(%rsp)
+	flds (%rsp)	# 0x1p63
+	fld %st(1)
+	fsub %st(1)
+	faddp
+	fucomip %st(1),%st
+	je 2f		# x - 0x1p63 + 0x1p63 == x
+	movl $1,(%rsp)
+	flds (%rsp)	# 0x1p-149
+	fdiv %st(1)
+	fstps (%rsp)	# raise underflow
+2:	fld1
+	fld %st(1)
+	frndint
+	fxch %st(2)
+	fsub %st(2)	# st(0)=x-rint(x), st(1)=1, st(2)=rint(x)
+	f2xm1
+	faddp		# 2^(x-rint(x))
+1:	fscale
+	fstp %st(1)
+	add $16,%rsp
+	ret
+3:	xor %eax,%eax
+4:	cmp $0x3fff-64,%ax
+	fld1
+	jb 1b		# |x| < 0x1p-64
+	fstpt (%rsp)
+	fistl 8(%rsp)
+	fildl 8(%rsp)
+	fsubrp %st(1)
+	addl $0x3fff,8(%rsp)
+	f2xm1
+	fld1
+	faddp		# 2^(x-rint(x))
+	fldt (%rsp)	# 2^rint(x)
+	fmulp
+	add $16,%rsp
+	ret
diff --git a/src/math/x86_64/expl.s b/src/math/x86_64/expl.s
new file mode 100644
index 0000000..798261d
--- /dev/null
+++ b/src/math/x86_64/expl.s
@@ -0,0 +1,101 @@
+# exp(x) = 2^hi + 2^hi (2^lo - 1)
+# where hi+lo = log2e*x with 128bit precision
+# exact log2e*x calculation depends on nearest rounding mode
+# using the exact multiplication method of Dekker and Veltkamp
+
+.global expl
+.type expl,@function
+expl:
+	fldt 8(%rsp)
+
+	# interesting case: 0x1p-32 <= |x| < 16384
+	# check if (exponent|0x8000) is in [0xbfff-32, 0xbfff+13]
+	mov 16(%rsp), %ax
+	or $0x8000, %ax
+	sub $0xbfdf, %ax
+	cmp $45, %ax
+	jbe 2f
+	test %ax, %ax
+	fld1
+	js 1f
+	# if |x|>=0x1p14 or nan return 2^trunc(x)
+	fscale
+	fstp %st(1)
+	ret
+	# if |x|<0x1p-32 return 1+x
+1:	faddp
+	ret
+
+	# should be 0x1.71547652b82fe178p0L == 0x3fff b8aa3b29 5c17f0bc
+	# it will be wrong on non-nearest rounding mode
+2:	fldl2e
+	subq $48, %rsp
+	# hi = log2e_hi*x
+	# 2^hi = exp2l(hi)
+	fmul %st(1),%st
+	fld %st(0)
+	fstpt (%rsp)
+	fstpt 16(%rsp)
+	fstpt 32(%rsp)
+	call exp2l@PLT
+	# if 2^hi == inf return 2^hi
+	fld %st(0)
+	fstpt (%rsp)
+	cmpw $0x7fff, 8(%rsp)
+	je 1f
+	fldt 32(%rsp)
+	fldt 16(%rsp)
+	# fpu stack: 2^hi x hi
+	# exact mult: x*log2e
+	fld %st(1)
+	# c = 0x1p32+1
+	movq $0x41f0000000100000,%rax
+	pushq %rax
+	fldl (%rsp)
+	# xh = x - c*x + c*x
+	# xl = x - xh
+	fmulp
+	fld %st(2)
+	fsub %st(1), %st
+	faddp
+	fld %st(2)
+	fsub %st(1), %st
+	# yh = log2e_hi - c*log2e_hi + c*log2e_hi
+	movq $0x3ff7154765200000,%rax
+	pushq %rax
+	fldl (%rsp)
+	# fpu stack: 2^hi x hi xh xl yh
+	# lo = hi - xh*yh + xl*yh
+	fld %st(2)
+	fmul %st(1), %st
+	fsubp %st, %st(4)
+	fmul %st(1), %st
+	faddp %st, %st(3)
+	# yl = log2e_hi - yh
+	movq $0x3de705fc2f000000,%rax
+	pushq %rax
+	fldl (%rsp)
+	# fpu stack: 2^hi x lo xh xl yl
+	# lo += xh*yl + xl*yl
+	fmul %st, %st(2)
+	fmulp %st, %st(1)
+	fxch %st(2)
+	faddp
+	faddp
+	# log2e_lo
+	movq $0xbfbe,%rax
+	pushq %rax
+	movq $0x82f0025f2dc582ee,%rax
+	pushq %rax
+	fldt (%rsp)
+	addq $40,%rsp
+	# fpu stack: 2^hi x lo log2e_lo
+	# lo += log2e_lo*x
+	# return 2^hi + 2^hi (2^lo - 1)
+	fmulp %st, %st(2)
+	faddp
+	f2xm1
+	fmul %st(1), %st
+	faddp
+1:	addq $48, %rsp
+	ret
diff --git a/src/math/x86_64/expm1l.s b/src/math/x86_64/expm1l.s
new file mode 100644
index 0000000..e773f08
--- /dev/null
+++ b/src/math/x86_64/expm1l.s
@@ -0,0 +1 @@
+# see exp2l.s
diff --git a/src/math/x86_64/fabs.c b/src/math/x86_64/fabs.c
new file mode 100644
index 0000000..1656247
--- /dev/null
+++ b/src/math/x86_64/fabs.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+double fabs(double x)
+{
+	double t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrlq $1, %0" : "+x"(t));            // t >>= 1
+	__asm__ ("andps %1, %0" : "+x"(x) : "x"(t));   // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabsf.c b/src/math/x86_64/fabsf.c
new file mode 100644
index 0000000..36ea748
--- /dev/null
+++ b/src/math/x86_64/fabsf.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+float fabsf(float x)
+{
+	float t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrld $1, %0" : "+x"(t));            // t >>= 1
+	__asm__ ("andps %1, %0" : "+x"(x) : "x"(t));   // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabsl.c b/src/math/x86_64/fabsl.c
new file mode 100644
index 0000000..cc1c9ed
--- /dev/null
+++ b/src/math/x86_64/fabsl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double fabsl(long double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/floorl.s b/src/math/x86_64/floorl.s
new file mode 100644
index 0000000..80da466
--- /dev/null
+++ b/src/math/x86_64/floorl.s
@@ -0,0 +1,27 @@
+.global floorl
+.type floorl,@function
+floorl:
+	fldt 8(%rsp)
+1:	mov $0x7,%al
+1:	fstcw 8(%rsp)
+	mov 9(%rsp),%ah
+	mov %al,9(%rsp)
+	fldcw 8(%rsp)
+	frndint
+	mov %ah,9(%rsp)
+	fldcw 8(%rsp)
+	ret
+
+.global ceill
+.type ceill,@function
+ceill:
+	fldt 8(%rsp)
+	mov $0xb,%al
+	jmp 1b
+
+.global truncl
+.type truncl,@function
+truncl:
+	fldt 8(%rsp)
+	mov $0xf,%al
+	jmp 1b
diff --git a/src/math/x86_64/fma.c b/src/math/x86_64/fma.c
new file mode 100644
index 0000000..4dd53f2
--- /dev/null
+++ b/src/math/x86_64/fma.c
@@ -0,0 +1,23 @@
+#include <math.h>
+
+#if __FMA__
+
+double fma(double x, double y, double z)
+{
+	__asm__ ("vfmadd132sd %1, %2, %0" : "+x" (x) : "x" (y), "x" (z));
+	return x;
+}
+
+#elif __FMA4__
+
+double fma(double x, double y, double z)
+{
+	__asm__ ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z));
+	return x;
+}
+
+#else
+
+#include "../fma.c"
+
+#endif
diff --git a/src/math/x86_64/fmaf.c b/src/math/x86_64/fmaf.c
new file mode 100644
index 0000000..30b971f
--- /dev/null
+++ b/src/math/x86_64/fmaf.c
@@ -0,0 +1,23 @@
+#include <math.h>
+
+#if __FMA__
+
+float fmaf(float x, float y, float z)
+{
+	__asm__ ("vfmadd132ss %1, %2, %0" : "+x" (x) : "x" (y), "x" (z));
+	return x;
+}
+
+#elif __FMA4__
+
+float fmaf(float x, float y, float z)
+{
+	__asm__ ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z));
+	return x;
+}
+
+#else
+
+#include "../fmaf.c"
+
+#endif
diff --git a/src/math/x86_64/fmodl.c b/src/math/x86_64/fmodl.c
new file mode 100644
index 0000000..3daeab0
--- /dev/null
+++ b/src/math/x86_64/fmodl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double fmodl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/llrint.c b/src/math/x86_64/llrint.c
new file mode 100644
index 0000000..dd38a72
--- /dev/null
+++ b/src/math/x86_64/llrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrint(double x)
+{
+	long long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrintf.c b/src/math/x86_64/llrintf.c
new file mode 100644
index 0000000..fc8625e
--- /dev/null
+++ b/src/math/x86_64/llrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintf(float x)
+{
+	long long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrintl.c b/src/math/x86_64/llrintl.c
new file mode 100644
index 0000000..c439ef2
--- /dev/null
+++ b/src/math/x86_64/llrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintl(long double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/log10l.s b/src/math/x86_64/log10l.s
new file mode 100644
index 0000000..48ea4af
--- /dev/null
+++ b/src/math/x86_64/log10l.s
@@ -0,0 +1,7 @@
+.global log10l
+.type log10l,@function
+log10l:
+	fldlg2
+	fldt 8(%rsp)
+	fyl2x
+	ret
diff --git a/src/math/x86_64/log1pl.s b/src/math/x86_64/log1pl.s
new file mode 100644
index 0000000..955c9db
--- /dev/null
+++ b/src/math/x86_64/log1pl.s
@@ -0,0 +1,15 @@
+.global log1pl
+.type log1pl,@function
+log1pl:
+	mov 14(%rsp),%eax
+	fldln2
+	and $0x7fffffff,%eax
+	fldt 8(%rsp)
+	cmp $0x3ffd9400,%eax
+	ja 1f
+	fyl2xp1
+	ret
+1:	fld1
+	faddp
+	fyl2x
+	ret
diff --git a/src/math/x86_64/log2l.s b/src/math/x86_64/log2l.s
new file mode 100644
index 0000000..ba08b9f
--- /dev/null
+++ b/src/math/x86_64/log2l.s
@@ -0,0 +1,7 @@
+.global log2l
+.type log2l,@function
+log2l:
+	fld1
+	fldt 8(%rsp)
+	fyl2x
+	ret
diff --git a/src/math/x86_64/logl.s b/src/math/x86_64/logl.s
new file mode 100644
index 0000000..20dd1f8
--- /dev/null
+++ b/src/math/x86_64/logl.s
@@ -0,0 +1,7 @@
+.global logl
+.type logl,@function
+logl:
+	fldln2
+	fldt 8(%rsp)
+	fyl2x
+	ret
diff --git a/src/math/x86_64/lrint.c b/src/math/x86_64/lrint.c
new file mode 100644
index 0000000..a742fec
--- /dev/null
+++ b/src/math/x86_64/lrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrint(double x)
+{
+	long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrintf.c b/src/math/x86_64/lrintf.c
new file mode 100644
index 0000000..2ba5639
--- /dev/null
+++ b/src/math/x86_64/lrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintf(float x)
+{
+	long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrintl.c b/src/math/x86_64/lrintl.c
new file mode 100644
index 0000000..068e2e4
--- /dev/null
+++ b/src/math/x86_64/lrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintl(long double x)
+{
+	long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/remainderl.c b/src/math/x86_64/remainderl.c
new file mode 100644
index 0000000..8cf7507
--- /dev/null
+++ b/src/math/x86_64/remainderl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double remainderl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/remquol.c b/src/math/x86_64/remquol.c
new file mode 100644
index 0000000..60eef08
--- /dev/null
+++ b/src/math/x86_64/remquol.c
@@ -0,0 +1,32 @@
+#include <math.h>
+
+long double remquol(long double x, long double y, int *quo)
+{
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	/* By ensuring that addresses of x and y cannot be discarded,
+	 * this empty asm guides GCC into representing extraction of
+	 * their sign bits as memory loads rather than making x and y
+	 * not-address-taken internally and using bitfield operations,
+	 * which in the end wouldn't work out, as extraction from FPU
+	 * registers needs to go through memory anyway. This way GCC
+	 * should manage to use incoming stack slots without spills. */
+	__asm__ ("" :: "X"(cx), "X"(cy));
+
+	long double t = x;
+	unsigned fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	/* C0, C1, C3 flags in x87 status word carry low bits of quotient:
+	 * 15 14 13 12 11 10  9  8
+	 *  . C3  .  .  . C2 C1 C0
+	 *  . b1  .  .  .  0 b0 b2 */
+	unsigned char i = fpsr >> 8;
+	i = i>>4 | i<<4;
+	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via
+	 * in-register table lookup. */
+	unsigned qbits = 0x7575313164642020 >> (i & 60);
+	qbits &= 7;
+
+	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits;
+	return t;
+}
diff --git a/src/math/x86_64/rintl.c b/src/math/x86_64/rintl.c
new file mode 100644
index 0000000..e1a9207
--- /dev/null
+++ b/src/math/x86_64/rintl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double rintl(long double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrt.c b/src/math/x86_64/sqrt.c
new file mode 100644
index 0000000..657e09e
--- /dev/null
+++ b/src/math/x86_64/sqrt.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double sqrt(double x)
+{
+	__asm__ ("sqrtsd %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtf.c b/src/math/x86_64/sqrtf.c
new file mode 100644
index 0000000..720baec
--- /dev/null
+++ b/src/math/x86_64/sqrtf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float sqrtf(float x)
+{
+	__asm__ ("sqrtss %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtl.c b/src/math/x86_64/sqrtl.c
new file mode 100644
index 0000000..864cfcc
--- /dev/null
+++ b/src/math/x86_64/sqrtl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/truncl.s b/src/math/x86_64/truncl.s
new file mode 100644
index 0000000..f5cfa3b
--- /dev/null
+++ b/src/math/x86_64/truncl.s
@@ -0,0 +1 @@
+# see floorl.s
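
A note on exp2l.s: f2xm1 only accepts arguments with magnitude at most 1, so the code splits x into n = rint(x) and t = x - n, computes 2^t - 1, adds 1 back, and scales by 2^n (fscale, or direct biasing of the stored exponent on the large-argument path). The C sketch below shows only that reduction, ignoring the overflow, underflow and non-finite handling in the assembly; exp2l_core is a hypothetical name and the inline f2xm1 stands in for the x87 sequence.

#include <math.h>

static long double exp2l_core(long double x)
{
	long double n = rintl(x);        /* integer part, round to nearest */
	long double t = x - n;           /* fraction, |t| <= 0.5 */
	__asm__ ("f2xm1" : "+t"(t));     /* t = 2^t - 1, valid since |t| <= 1 */
	return ldexpl(t + 1.0L, (int)n); /* scale by 2^n, as fscale does */
}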
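
A note on expl.s: the Dekker/Veltkamp method named in the header computes a product to roughly twice the working precision by splitting each factor into halves whose partial products are exact. The sketch below shows the same technique in double precision (split constant 0x1p27+1 for the 53-bit mantissa, versus the 0x1p32+1 used above for the 64-bit x87 mantissa); split and twoprod are hypothetical names, and round-to-nearest without FMA contraction is assumed.

#include <stdio.h>

static void split(double a, double *ah, double *al)
{
	double c = 0x1p27 + 1;  /* 2^27+1 splits a 53-bit mantissa in half */
	double p = c * a;
	*ah = (a - p) + p;      /* upper ~26 mantissa bits */
	*al = a - *ah;          /* exact remainder */
}

static void twoprod(double x, double y, double *hi, double *lo)
{
	double xh, xl, yh, yl;
	split(x, &xh, &xl);
	split(y, &yh, &yl);
	*hi = x * y;                                   /* rounded product */
	*lo = ((xh*yh - *hi) + xh*yl + xl*yh) + xl*yl; /* its rounding error, exactly */
}

int main(void)
{
	double hi, lo;
	twoprod(1.4426950408889634, 100.0, &hi, &lo); /* roughly log2e * x */
	printf("hi = %.17g, lo = %.17g\n", hi, lo);   /* hi + lo == x*y exactly */
	return 0;
}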
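
A note on fabs.c/fabsf.c: the three SSE instructions build an all-ones register, shift it right by one bit so only the sign position is cleared, and AND it into the argument. The same operation written portably, with fabs_sketch as a hypothetical name for illustration:

#include <stdint.h>
#include <string.h>

static double fabs_sketch(double x)
{
	uint64_t u;
	memcpy(&u, &x, sizeof u);  /* reinterpret the double as 64 bits */
	u &= ~0ULL >> 1;           /* all-ones shifted right once: sign bit cleared */
	memcpy(&x, &u, sizeof x);
	return x;
}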
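
A note on floorl.s: the %al values 0x7, 0xb and 0xf written into the high control-word byte select round-down, round-up and round-toward-zero in the x87 rounding-control field (with 64-bit precision), so a single frndint serves floorl, ceill and truncl. Roughly the same idea in portable C, using <fenv.h>; round_with_mode is a hypothetical helper, and a strict version would also need FENV_ACCESS enabled.

#include <fenv.h>
#include <math.h>

static long double round_with_mode(long double x, int mode)
{
	int old = fegetround();
	fesetround(mode);  /* FE_DOWNWARD, FE_UPWARD or FE_TOWARDZERO */
	x = rintl(x);      /* rounds to an integer in the current mode */
	fesetround(old);
	return x;
}

/* floorl(x) ~ round_with_mode(x, FE_DOWNWARD)
 * ceill(x)  ~ round_with_mode(x, FE_UPWARD)
 * truncl(x) ~ round_with_mode(x, FE_TOWARDZERO) */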
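
A note on remquol.c: after fprem1 the three low quotient bits sit in C0 (b2), C1 (b0) and C3 (b1) of the status word; the nibble swap plus the 64-bit constant act as a 16-entry, 3-bit-wide lookup table that reassembles them as b2:b1:b0. A standalone check of that decode (not part of the patch), enumerating the flag combinations:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (unsigned b2 = 0; b2 <= 1; b2++)
	for (unsigned b1 = 0; b1 <= 1; b1++)
	for (unsigned b0 = 0; b0 <= 1; b0++) {
		unsigned fpsr = b2<<8 | b0<<9 | b1<<14;  /* C0=b2, C1=b0, C3=b1, C2=0 */
		unsigned char i = fpsr >> 8;
		i = i>>4 | i<<4;                         /* same steps as remquol.c */
		unsigned qbits = 0x7575313164642020 >> (i & 60);
		qbits &= 7;
		assert(qbits == (b2<<2 | b1<<1 | b0));
	}
	puts("table decodes C0/C1/C3 into the low three quotient bits");
	return 0;
}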