Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 161 additions & 5 deletions src/erf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ for f in (:erf, :erfc)
@eval begin
$f(x::Number) = $internalf(float(x))

$internalf(x::Float64) = ccall(($libopenlibmf, libopenlibm), Float64, (Float64,), x)
$internalf(x::Float32) = ccall(($libopenlibmf0, libopenlibm), Float32, (Float32,), x)
$internalf(x::Float16) = Float16($internalf(Float32(x)))

$internalf(z::Complex{Float64}) = Complex{Float64}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), z, zero(Float64)))
$internalf(z::Complex{Float32}) = Complex{Float32}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), Complex{Float64}(z), Float64(eps(Float32))))
Expand All @@ -28,6 +25,10 @@ for f in (:erf, :erfc)
end
end

_erfc(x::Float64) = ccall((:erfc, libopenlibm), Float64, (Float64,), x)
_erfc(x::Float32) = ccall((:erfcf, libopenlibm), Float32, (Float32,), x)
_erfc(x::Float16) = Float16(_erfc(Float32(x)))

for f in (:erfcx, :erfi, :dawson, :faddeeva)
internalf = Symbol(:_, f)
openspecfunfsym = Symbol(:Faddeeva_, f === :dawson ? :Dawson : f === :faddeeva ? :w : f)
Expand Down Expand Up @@ -96,10 +97,165 @@ See also:
[`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv).

# Implementation by
- `Float32`/`Float64`: C standard math library
[libm](https://en.wikipedia.org/wiki/C_mathematical_functions#libm).
- `Float32`: polynomial approximations of erf
- `Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c
- `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/)
"""

# Fast erf implementation using a mix of
# rational and polynomial approximations.
# Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.
function _erf(x::Float64)

# # top 32 bits
ix::UInt32=reinterpret(UInt64,x)>>32
# # top 32, without sign bit
ia::UInt32=ix & 0x7fffffff

if (ia < 0x3feb0000)
# a = |x| < 0.84375.

x2 = x * x

if (ia < 0x3fe00000)
## a < 0.5 - Use polynomial approximation.

# Minimax approximation of erf of the form x*P(x^2) approximately on the interval [0;0.5]
PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7)

r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy.
return r
else
## 0.5 <= a < 0.84375 - Use rational approximation.

# Rational approximation on [0x1p-28, 0.84375]
NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16)
DA=(1,0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18)

P=evalpoly(x2,NA)
Q=evalpoly(x2,DA)

return fma(P / Q, x, x)
end
elseif (ia < 0x3ff40000)
## 0.84375 <= |x| < 1.25.

# Rational approximation on [0.84375, 1.25]
NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 )
DB=( 1, 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 )

C = 0x1.b0ac16p-1

a = abs(x) - 1.0

P=evalpoly(a,NB)
Q=evalpoly(a,DB)

r= C + P / Q
return copysign(r,x)

elseif (ia < 0x40000000)
## 1.25 <= |x| < 2.0.
a = abs(x)
a = a - 1.25

# erfc polynomial approximation
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25
PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)

# Obtains erfc of |x|
r=1.0-evalpoly(a,PC)
return copysign(r,x)

elseif (ia < 0x400a0000)
## 2 <= |x| < 3.25.
a = abs(x)
a = fma(0.5, a, -1.0)

# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )

# Obtains erfc of |x|
r=1.0-evalpoly(a,PD)
return copysign(r,x)

elseif (ia < 0x40100000)
## 3.25 <= |x| < 4.0.
a = abs(x)
a = a - 3.25

# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )

r=1.0-evalpoly(a,PE)
return copysign(r,x)

elseif (ia < 0x4017a000)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any chance you can regenerate the polies to make this be 0x40180000? If you can, you would be able to use UInt16 literals for all of these by only taking the top 16 rather than top 32 bits.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would that have a meaningful impact?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably not much. It might just save a cycle or 2.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanted to test the speed, you could try it without regenerating the polys and it should give you a good idea.

## 4 <= |x| < 5.90625.
a = abs(x)
a = fma(0.5, a, -2.0)

# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )

r=1.0-evalpoly(a,PF)
return copysign(r,x)
elseif isnan(x)
return NaN
else
return copysign(1.0,x)
end
end

# Fast erf implementation using
# polynomial approximations of erf and erfc.
# Highest measured error is 1.12 ULPs at x = 1.232469
function _erf(x::Float32)
xabs=abs(x)

if (xabs< 0.5)
# range [0;0.5]
# # erf approximation using erf(x)=x+x*P(x^2) with degree 6
# Sollya::remez(erf(sqrt(x))/sqrt(x)-1,6,[1e-32;0.5],1,1e-32);

p=(0.12837917f0, -0.37612638f0, 0.11283784f0, -0.026865287f0, 0.005218856f0, -0.0008394848f0)
return copysign(fma(evalpoly(x^2,p),x,x),x)

elseif(xabs<1.25)
# range [0.5;1.25]
# # erfc approximation with degree 11
# Sollya::remez(erfc(x+0.5),11,[0;1.25-0.5],1,1e-32);

p=(0.47950011f0, -0.8787826f0, 0.4393913f0, 0.14646415f0, -0.18308467f0, -0.007286422f0, 0.04987047f0, -0.0048868246f0, -0.011067663f0, 0.003422347f0, 0.00073027064f0, -0.0003758171f0)
return copysign(1f0-evalpoly(xabs-0.5f0,p),x)

elseif(xabs<2.5)
# range [1.25;2.5]
# erfc approximation with degree 13
# Sollya::remez(erfc(x+1.25),13,[1.25-1.25;2.5-1.25],1,1e-32);

p=(0.077099875f0, -0.23652112f0, 0.2956514f0, -0.16753574f0, 0.006158593f0, 0.04718712f0, -0.021331023f0, -0.0035262543f0, 0.005461831f0, -0.00047858673f0, -0.0012763853f0, 0.00073386944f0, -0.00017831658f0, 1.7451624f-5)
return copysign(1f0-evalpoly(xabs-1.25f0,p),x)

elseif (xabs<4.0)
# range [2.5;4.0]
# erfc approximation with degree 13
# Sollya::remez(erfc(x+2.5),13,[0;4-2.5],1,1e-32);

p=(0.00040695202f0, -0.002178284f0, 0.0054457085f0, -0.008350053f0, 0.008622011f0, -0.006115167f0, 0.0027899458f0, -0.000519395f0, -0.00030461047f0, 0.00031068458f0, -0.00013866898f0, 3.6909692f-5, -5.682889f-6, 3.929763f-7)
return copysign(1f0-evalpoly(xabs-2.5f0,p),x)

elseif isnan(x)
return NaN32
else
# range [4.0,inf)
return copysign(1f0, x)
end
end

_erf(x::Float16)=Float16(_erf(Float32(x)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanted to do a Float16 impl, it should be easier than the others. Specifically, the domain is only to 2, and the accuracy required is much reduced.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

100% could wait for a followup PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking that too to be honest.
this and the poli regen.



function erf end
"""
erf(x, y)
Expand Down
18 changes: 18 additions & 0 deletions test/erf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,25 @@
@testset "real argument" begin
for T in (Float16, Float32, Float64)
@test @inferred(erf(T(1))) isa T
@test erf(T(0.25)) ≈ T(0.27632639016823696) rtol=2*eps(T)
@test erf(T(0.75)) ≈ T(0.7111556336535151) rtol=2*eps(T)
@test erf(T(1)) ≈ T(0.84270079294971486934) rtol=2*eps(T)
@test erf(T(1.5)) ≈ T(0.9661051464753108) rtol=2*eps(T)
@test erf(T(2.5)) ≈ T(0.9995930479825551) rtol=2*eps(T)
@test erf(T(3.5)) ≈ T(0.9999992569016276) rtol=2*eps(T)
@test erf(T(4.5)) ≈ T(0.9999999998033839) rtol=2*eps(T)
@test erf(T(6)) ≈ T(1.0) rtol=2*eps(T)

@test erf(T(-0.25)) ≈ T(-0.27632639016823696) rtol=2*eps(T)
@test erf(T(-0.75)) ≈ T(-0.7111556336535151) rtol=2*eps(T)
@test erf(T(-1)) ≈ T(-0.84270079294971486934) rtol=2*eps(T)
@test erf(T(-1.5)) ≈ T(-0.9661051464753108) rtol=2*eps(T)
@test erf(T(-2.5)) ≈ T(-0.9995930479825551) rtol=2*eps(T)
@test erf(T(-3.5)) ≈ T(-0.9999992569016276) rtol=2*eps(T)
@test erf(T(-4.5)) ≈ T(-0.9999999998033839) rtol=2*eps(T)
@test erf(T(-6)) ≈ T(-1.0) rtol=2*eps(T)

@test isnan(erf(T(NaN)))

@test @inferred(erfc(T(1))) isa T
@test erfc(T(1)) ≈ T(0.15729920705028513066) rtol=2*eps(T)
Expand Down
Loading