diff --git a/src/erf.jl b/src/erf.jl
index c4b57e29..a97e9496 100644
--- a/src/erf.jl
+++ b/src/erf.jl
@@ -12,9 +12,6 @@ for f in (:erf, :erfc)
     @eval begin
         $f(x::Number) = $internalf(float(x))
 
-        $internalf(x::Float64) = ccall(($libopenlibmf, libopenlibm), Float64, (Float64,), x)
-        $internalf(x::Float32) = ccall(($libopenlibmf0, libopenlibm), Float32, (Float32,), x)
-        $internalf(x::Float16) = Float16($internalf(Float32(x)))
 
         $internalf(z::Complex{Float64}) = Complex{Float64}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), z, zero(Float64)))
         $internalf(z::Complex{Float32}) = Complex{Float32}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), Complex{Float64}(z), Float64(eps(Float32))))
@@ -28,6 +25,10 @@ for f in (:erf, :erfc)
     end
 end
 
+_erfc(x::Float64) = ccall((:erfc, libopenlibm), Float64, (Float64,), x)
+_erfc(x::Float32) = ccall((:erfcf, libopenlibm), Float32, (Float32,), x)
+_erfc(x::Float16) = Float16(_erfc(Float32(x)))
+
 for f in (:erfcx, :erfi, :dawson, :faddeeva)
     internalf = Symbol(:_, f)
     openspecfunfsym = Symbol(:Faddeeva_, f === :dawson ? :Dawson : f === :faddeeva ? :w : f)
@@ -96,10 +97,165 @@ See also:
 [`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv).
 
 # Implementation by
-- `Float32`/`Float64`: C standard math library
-    [libm](https://en.wikipedia.org/wiki/C_mathematical_functions#libm).
+- `Float32`: polynomial approximations of erf 
+- `Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c 
 - `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/)
 """
+
+#    Fast erf implementation using a mix of
+#    rational and polynomial approximations.
+#    Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.  
+function _erf(x::Float64)
+
+    # # top 32 bits 
+    ix::UInt32=reinterpret(UInt64,x)>>32
+    # # top 32, without sign bit 
+    ia::UInt32=ix & 0x7fffffff
+
+    if (ia < 0x3feb0000)
+    #  a = |x| < 0.84375.  
+
+        x2 = x * x
+
+        if (ia < 0x3fe00000)
+        ## a < 0.5  - Use polynomial approximation.  
+
+            # Minimax approximation of erf of the form x*P(x^2) approximately on the interval [0;0.5]
+            PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7)
+        
+            r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy.  
+            return r
+        else
+        ## 0.5 <= a < 0.84375 - Use rational approximation.  
+
+            # Rational approximation on [0x1p-28, 0.84375] 
+            NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16)
+            DA=(1,0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18)
+
+            P=evalpoly(x2,NA)
+            Q=evalpoly(x2,DA)
+
+            return fma(P / Q, x, x)
+        end
+    elseif (ia < 0x3ff40000)
+    ## 0.84375 <= |x| < 1.25.  
+
+        # Rational approximation on [0.84375, 1.25] 
+        NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 )
+        DB=( 1, 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 )
+
+        C = 0x1.b0ac16p-1
+
+        a = abs(x) - 1.0
+
+        P=evalpoly(a,NB)
+        Q=evalpoly(a,DB)
+
+        r= C + P / Q
+        return copysign(r,x)
+
+    elseif (ia < 0x40000000)
+    ## 1.25 <= |x| < 2.0.  
+        a = abs(x)
+        a = a - 1.25
+        
+        # erfc polynomial approximation
+        # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 
+        PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)
+
+        # Obtains erfc of |x|
+        r=1.0-evalpoly(a,PC)
+        return copysign(r,x)
+
+    elseif (ia < 0x400a0000)
+    ## 2 <= |x| < 3.25.  
+        a = abs(x)
+        a = fma(0.5, a, -1.0)
+
+        # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 
+        PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )
+
+        # Obtains erfc of |x|
+        r=1.0-evalpoly(a,PD)
+        return copysign(r,x)
+
+    elseif (ia < 0x40100000)
+    ## 3.25 <= |x| < 4.0.  
+        a = abs(x)
+        a = a - 3.25
+
+        # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 
+        PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )
+
+        r=1.0-evalpoly(a,PE)
+        return copysign(r,x)
+
+    elseif (ia < 0x4017a000)
+    ## 4 <= |x| < 5.90625.  
+        a = abs(x)
+        a = fma(0.5, a, -2.0)
+
+        # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 
+        PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )
+
+        r=1.0-evalpoly(a,PF)
+        return copysign(r,x)
+    elseif isnan(x)
+        return NaN
+    else
+        return copysign(1.0,x)
+    end
+end
+
+#    Fast erf implementation using 
+#    polynomial approximations of erf and erfc.
+#    Highest measured error is 1.12 ULPs at x = 1.232469  
+function _erf(x::Float32)
+    xabs=abs(x)
+
+  if (xabs< 0.5)
+    # range [0;0.5]
+    # # erf approximation using erf(x)=x+x*P(x^2) with degree 6
+    # Sollya::remez(erf(sqrt(x))/sqrt(x)-1,6,[1e-32;0.5],1,1e-32);
+
+    p=(0.12837917f0, -0.37612638f0, 0.11283784f0, -0.026865287f0, 0.005218856f0, -0.0008394848f0)
+    return copysign(fma(evalpoly(x^2,p),x,x),x)
+
+  elseif(xabs<1.25)
+    # range [0.5;1.25]
+    # # erfc approximation with degree 11
+    # Sollya::remez(erfc(x+0.5),11,[0;1.25-0.5],1,1e-32);
+    
+    p=(0.47950011f0, -0.8787826f0, 0.4393913f0, 0.14646415f0, -0.18308467f0, -0.007286422f0, 0.04987047f0, -0.0048868246f0, -0.011067663f0, 0.003422347f0, 0.00073027064f0, -0.0003758171f0)
+    return copysign(1f0-evalpoly(xabs-0.5f0,p),x)
+
+  elseif(xabs<2.5)
+    # range [1.25;2.5]
+    # erfc approximation with degree 13
+    # Sollya::remez(erfc(x+1.25),13,[1.25-1.25;2.5-1.25],1,1e-32);
+
+    p=(0.077099875f0, -0.23652112f0, 0.2956514f0, -0.16753574f0, 0.006158593f0, 0.04718712f0, -0.021331023f0, -0.0035262543f0, 0.005461831f0, -0.00047858673f0, -0.0012763853f0, 0.00073386944f0, -0.00017831658f0, 1.7451624f-5)
+    return copysign(1f0-evalpoly(xabs-1.25f0,p),x)
+
+  elseif (xabs<4.0)
+    # range [2.5;4.0]
+    # erfc approximation with degree 13
+    # Sollya::remez(erfc(x+2.5),13,[0;4-2.5],1,1e-32);
+
+    p=(0.00040695202f0, -0.002178284f0, 0.0054457085f0, -0.008350053f0, 0.008622011f0, -0.006115167f0, 0.0027899458f0, -0.000519395f0, -0.00030461047f0, 0.00031068458f0, -0.00013866898f0, 3.6909692f-5, -5.682889f-6, 3.929763f-7)
+    return copysign(1f0-evalpoly(xabs-2.5f0,p),x)
+    
+  elseif isnan(x)
+    return NaN32
+  else
+    # range [4.0,inf)
+    return copysign(1f0, x)
+  end
+end
+
+_erf(x::Float16)=Float16(_erf(Float32(x)))
+
+
 function erf end
 """
     erf(x, y)
diff --git a/test/erf.jl b/test/erf.jl
index 29127343..2aa01387 100644
--- a/test/erf.jl
+++ b/test/erf.jl
@@ -2,7 +2,25 @@
     @testset "real argument" begin
         for T in (Float16, Float32, Float64)
             @test @inferred(erf(T(1))) isa T
+            @test erf(T(0.25))   ≈ T(0.27632639016823696) rtol=2*eps(T)
+            @test erf(T(0.75))   ≈ T(0.7111556336535151) rtol=2*eps(T)
             @test erf(T(1))   ≈ T(0.84270079294971486934) rtol=2*eps(T)
+            @test erf(T(1.5))   ≈ T(0.9661051464753108) rtol=2*eps(T)
+            @test erf(T(2.5))   ≈ T(0.9995930479825551) rtol=2*eps(T)
+            @test erf(T(3.5))   ≈ T(0.9999992569016276) rtol=2*eps(T)
+            @test erf(T(4.5))   ≈ T(0.9999999998033839) rtol=2*eps(T)
+            @test erf(T(6))   ≈ T(1.0) rtol=2*eps(T)
+
+            @test erf(T(-0.25))   ≈ T(-0.27632639016823696) rtol=2*eps(T)
+            @test erf(T(-0.75))   ≈ T(-0.7111556336535151) rtol=2*eps(T)
+            @test erf(T(-1))   ≈ T(-0.84270079294971486934) rtol=2*eps(T)
+            @test erf(T(-1.5))   ≈ T(-0.9661051464753108) rtol=2*eps(T)
+            @test erf(T(-2.5))   ≈ T(-0.9995930479825551) rtol=2*eps(T)
+            @test erf(T(-3.5))   ≈ T(-0.9999992569016276) rtol=2*eps(T)
+            @test erf(T(-4.5))   ≈ T(-0.9999999998033839) rtol=2*eps(T)
+            @test erf(T(-6))   ≈ T(-1.0) rtol=2*eps(T)
+
+            @test isnan(erf(T(NaN)))
 
             @test @inferred(erfc(T(1))) isa T
             @test erfc(T(1))  ≈ T(0.15729920705028513066) rtol=2*eps(T)