diff --git a/Project.toml b/Project.toml
index aaaccc94..62caa680 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,7 +10,6 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReliabilityDiagrams = "e5f51471-6270-49e4-a15a-f1cfbff4f856"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
 julia = "1"
diff --git a/examples/batchensemble.jl b/examples/batchensemble.jl
deleted file mode 100644
index cb44d2a5..00000000
--- a/examples/batchensemble.jl
+++ /dev/null
@@ -1,207 +0,0 @@
-## Classification of MNIST dataset
-## with the convolutional neural network known as LeNet5.
-## This script also combines various
-## packages from the Julia ecosystem with Flux.
-using Flux
-using Flux.Data: DataLoader
-using Flux.Optimise: Optimiser, WeightDecay
-using Flux: onehotbatch, onecold, glorot_normal, label_smoothing
-using Flux.Losses: logitcrossentropy
-using Statistics, Random
-using Logging: with_logger
-using TensorBoardLogger: TBLogger, tb_overwrite, set_step!, set_step_increment!
-using ProgressMeter: @showprogress
-import MLDatasets
-import BSON
-using CUDA
-using Formatting
-
-using DeepUncertainty
-
-# LeNet5 "constructor".
-# The model can be adapted to any image size
-# and any number of output classes.
-function LeNet5(args; imgsize = (28, 28, 1), nclasses = 10)
-    out_conv_size = (imgsize[1] ÷ 4 - 3, imgsize[2] ÷ 4 - 3, 16)
-
-    return Chain(
-        ConvBatchEnsemble((5, 5), imgsize[end] => 6, args.rank, args.ensemble_size, relu),
-        MaxPool((2, 2)),
-        ConvBatchEnsemble((5, 5), 6 => 16, args.rank, args.ensemble_size, relu),
-        MaxPool((2, 2)),
-        flatten,
-        DenseBatchEnsemble(prod(out_conv_size), 120, args.rank, args.ensemble_size, relu),
-        DenseBatchEnsemble(120, 84, args.rank, args.ensemble_size, relu),
-        DenseBatchEnsemble(84, nclasses, args.rank, args.ensemble_size),
-    )
-end
-
-function get_data(args)
-    xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
-    xtest, ytest = MLDatasets.MNIST.testdata(Float32)
-
-    xtrain = reshape(xtrain, 28, 28, 1, :)
-    xtest = reshape(xtest, 28, 28, 1, :)
-
-    ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
-
-    train_loader = DataLoader(
-        (xtrain, ytrain),
-        batchsize = args.batchsize,
-        shuffle = true,
-        partial = false,
-    )
-    test_loader = DataLoader((xtest, ytest), batchsize = args.batchsize, partial = false)
-
-    return train_loader, test_loader
-end
-
-loss(ŷ, y) = logitcrossentropy(ŷ, y)
-
-function accuracy(preds, labels)
-    acc = sum(onecold(preds |> cpu) .== onecold(labels |> cpu))
-    return acc
-end
-
-function eval_loss_accuracy(args, loader, model, device)
-    l = [0.0f0 for x = 1:args.ensemble_size]
-    acc = [0 for x = 1:args.ensemble_size]
-    ece_list = [0.0f0 for x = 1:args.ensemble_size]
-    ntot = 0
-    mean_l = 0
-    mean_acc = 0
-    mean_ece = 0
-    for (x, y) in loader
-        x = repeat(x, 1, 1, 1, args.ensemble_size)
-        x, y = x |> device, y |> device
-        # Perform the forward pass
-        ŷ = model(x)
-        ŷ = softmax(ŷ, dims = 1)
-        # Reshape the predictions into [classes, batch_size, ensemble_size
-        reshaped_ŷ = reshape(ŷ, size(ŷ)[1], args.batchsize, args.ensemble_size)
-        # Loop through each model's predictions
-        for ensemble = 1:args.ensemble_size
-            model_predictions = reshaped_ŷ[:, :, ensemble]
-            # Calculate individual loss
-            l[ensemble] += loss(model_predictions, y) * size(model_predictions)[end]
-            acc[ensemble] += accuracy(model_predictions, y)
-            ece_list[ensemble] +=
-                ExpectedCalibrationError(model_predictions |> cpu, onecold(y |> cpu)) *
-                args.batchsize
-        end
-        # Get the mean predictions
-        mean_predictions = mean(reshaped_ŷ, dims = ndims(reshaped_ŷ))
-        mean_predictions = dropdims(mean_predictions, dims = ndims(mean_predictions))
-        mean_l += loss(mean_predictions, y) * size(mean_predictions)[end]
-        mean_acc += accuracy(mean_predictions, y)
-        mean_ece +=
-            ExpectedCalibrationError(mean_predictions |> cpu, onecold(y |> cpu)) *
-            args.batchsize
-        ntot += size(mean_predictions)[end]
-    end
-    # Normalize the loss
-    losses = [loss / ntot |> round4 for loss in l]
-    acc = [a / ntot * 100 |> round4 for a in acc]
-    ece_list = [x / ntot |> round4 for x in ece_list]
-    # Calculate mean loss
-    mean_l = mean_l / ntot |> round4
-    mean_acc = mean_acc / ntot * 100 |> round4
-    mean_ece = mean_ece / ntot |> round4
-
-    # Print the per ensemble mode loss and accuracy
-    for ensemble = 1:args.ensemble_size
-        @info (format(
-            "Model {} Loss: {} Accuracy: {} ECE: {}",
-            ensemble,
-            losses[ensemble],
-            acc[ensemble],
-            ece_list[ensemble],
-        ))
-    end
-    @info (format(
-        "Mean Loss: {} Mean Accuracy: {} Mean ECE: {}",
-        mean_l,
-        mean_acc,
-        mean_ece,
-    ))
-    @info "==========================================================="
-    return nothing
-end
-
-## utility functions
-num_params(model) = sum(length, Flux.params(model))
-round4(x) = round(x, digits = 4)
-
-# arguments for the `train` function
-Base.@kwdef mutable struct Args
-    η = 3e-4             # learning rate
-    λ = 0                # L2 regularizer param, implemented as weight decay
-    batchsize = 32       # batch size
-    epochs = 10          # number of epochs
-    seed = 0             # set seed > 0 for reproducibility
-    use_cuda = true      # if true use cuda (if available)
-    infotime = 1         # report every `infotime` epochs
-    checktime = 5        # Save the model every `checktime` epochs. Set to 0 for no checkpoints.
-    savepath = "runs/"   # results path
-    rank = 1
-    ensemble_size = 4
-end
-
-function train(; kws...)
-    args = Args(; kws...)
-    args.seed > 0 && Random.seed!(args.seed)
-    use_cuda = args.use_cuda && CUDA.functional()
-
-    if use_cuda
-        device = gpu
-        @info "Training on GPU"
-    else
-        device = cpu
-        @info "Training on CPU"
-    end
-
-    ## DATA
-    train_loader, test_loader = get_data(args)
-    @info "Dataset MNIST: $(train_loader.nobs) train and $(test_loader.nobs) test examples"
-
-    ## MODEL AND OPTIMIZER
-    model = LeNet5(args) |> device
-    @info "LeNet5 model: $(num_params(model)) trainable params"
-
-    ps = Flux.params(model)
-
-    opt = ADAM(args.η)
-    if args.λ > 0 # add weight decay, equivalent to L2 regularization
-        opt = Optimiser(WeightDecay(args.λ), opt)
-    end
-
-    function report(epoch)
-        # @info "Train Metrics"
-        # eval_loss_accuracy(args, train_loader, model, device)
-        @info "Test metrics"
-        eval_loss_accuracy(args, test_loader, model, device)
-    end
-
-    ## TRAINING
-    @info "Start Training"
-    report(0)
-    for epoch = 1:args.epochs
-        @showprogress for (x, y) in train_loader
-            # Make copies of batches for ensembles
-            x = repeat(x, 1, 1, 1, args.ensemble_size)
-            y = repeat(y, 1, args.ensemble_size)
-            x, y = x |> device, y |> device
-            gs = Flux.gradient(ps) do
-                ŷ = model(x)
-                loss(ŷ, y)
-            end
-
-            Flux.Optimise.update!(opt, ps, gs)
-        end
-
-        ## Printing and logging
-        epoch % args.infotime == 0 && report(epoch)
-    end
-end
-
-train()
diff --git a/src/DeepUncertainty.jl b/src/DeepUncertainty.jl
index 74de5589..5aacd657 100644
--- a/src/DeepUncertainty.jl
+++ b/src/DeepUncertainty.jl
@@ -1,17 +1,10 @@
 module DeepUncertainty
 
-using Flux
-using Random
-using Flux: @functor, glorot_normal, create_bias
-
 # Export layers
 export MCLayer, MCDense, MCConv
-export DenseBatchEnsemble, ConvBatchEnsemble
 export mean_loglikelihood, brier_score, ExpectedCalibrationError, prediction_metrics
 
 include("metrics.jl")
 include("layers/mclayers.jl")
-include("layers/BatchEnsemble/dense.jl")
-include("layers/BatchEnsemble/conv.jl")
 
 end
diff --git a/src/layers/BatchEnsemble/conv.jl b/src/layers/BatchEnsemble/conv.jl
deleted file mode 100644
index 564c943b..00000000
--- a/src/layers/BatchEnsemble/conv.jl
+++ /dev/null
@@ -1,145 +0,0 @@
-"""
-    ConvBatchEnsemble(filter, in => out, rank,
-                      ensemble_size, σ = identity;
-                      stride = 1, pad = 0, dilation = 1,
-                      groups = 1, [bias, weight, init])
-    ConvBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
-
-Creates a conv BatchEnsemble layer. Batch ensemble is a memory efficient alternative
-for deep ensembles. In deep ensembles, if the ensemble size is N, N different models
-are trained, making the time and memory complexity O(N * complexity of one network).
-BatchEnsemble generates weight matrices for each member in the ensemble using a
-couple of rank 1 vectors R (alpha), S (gamma), RS' and multiplying the result with
-weight matrix W element wise. We also call R and S as fast weights.
-
-Reference - https://arxiv.org/abs/2002.06715
-
-During both training and testing, we repeat the samples along the batch dimension
-N times, where N is the ensemble_size. For example, if each mini batch has 10 samples
-and our ensemble size is 4, then the actual input to the layer has 40 samples.
-The output of the layer has 40 samples as well, and each 10 samples can be considered
-as the output of an esnemble member.
-
-# Fields
-- `layer`: The dense layer which transforms the pertubed input to output
-- `alpha`: The first Fast weight of size (in_dim, ensemble_size)
-- `gamma`: The second Fast weight of size (out_dim, ensemble_size)
-- `ensemble_bias`: Bias added to the ensemble output, separate from dense layer bias
-- `ensemble_act`: The activation function to be applied on ensemble output
-- `rank`: Rank of the fast weights (rank > 1 doesn't work on GPU for now)
-
-# Arguments
-- `filter::NTuple{N,Integer}`: Kernel dimensions, eg, (5, 5)
-- `ch::Pair{<:Integer,<:Integer}`: Input channels => output channels
-- `rank::Integer`: Rank of the fast weights
-- `ensemble_size::Integer`: Number of models in the ensemble
-- `σ::F=identity`: Activation of the dense layer, defaults to identity
-- `init=glorot_normal`: Initialization function, defaults to glorot_normal
-- `alpha_init=glorot_normal`: Initialization function for the alpha fast weight,
-    defaults to glorot_normal
-- `gamma_init=glorot_normal`: Initialization function for the gamma fast weight,
-    defaults to glorot_normal
-- `bias::Bool=true`: Toggle the usage of bias in the dense layer
-- `ensemble_bias::Bool=true`: Toggle the usage of ensemble bias
-- `ensemble_act::F=identity`: Activation function for enseble outputs
-"""
-struct ConvBatchEnsemble{L,F,M,B}
-    layer::L
-    alpha::M
-    gamma::M
-    ensemble_bias::B
-    ensemble_act::F
-    rank::Any
-    function ConvBatchEnsemble(
-        layer::L,
-        alpha::M,
-        gamma::M,
-        ensemble_bias = true,
-        ensemble_act::F = identity,
-        rank = 1,
-    ) where {M,F,L}
-        ensemble_bias = create_bias(gamma, ensemble_bias, size(gamma)[1], size(gamma)[2])
-        new{typeof(layer),F,M,typeof(ensemble_bias)}(
-            layer,
-            alpha,
-            gamma,
-            ensemble_bias,
-            ensemble_act,
-            rank,
-        )
-    end
-end
-
-function ConvBatchEnsemble(
-    k::NTuple{N,Integer},
-    ch::Pair{<:Integer,<:Integer},
-    rank::Integer,
-    ensemble_size::Integer,
-    σ = identity;
-    init = glorot_normal,
-    alpha_init = glorot_normal,
-    gamma_init = glorot_normal,
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    groups = 1,
-    bias = true,
-    ensemble_bias = true,
-    ensemble_act = identity,
-) where {N}
-    layer = Flux.Conv(
-        k,
-        ch,
-        σ;
-        stride = stride,
-        pad = pad,
-        dilation = dilation,
-        init = init,
-        groups = groups,
-        bias = bias,
-    )
-    in_dim = ch[1]
-    out_dim = ch[2]
-    if rank >= 1
-        alpha_shape = (in_dim, ensemble_size)
-        gamma_shape = (out_dim, ensemble_size)
-    else
-        error("Rank must be >= 1.")
-    end
-    alpha = alpha_init(alpha_shape)
-    gamma = gamma_init(gamma_shape)
-
-    return ConvBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
-end
-
-@functor ConvBatchEnsemble
-
-function (be::ConvBatchEnsemble)(x)
-    # Conv Batch Ensemble params
-    layer = be.layer
-    alpha = be.alpha
-    gamma = be.gamma
-    e_b = be.ensemble_bias
-    e_σ = be.ensemble_act
-
-    batch_size = size(x)[end]
-    in_size = size(alpha)[1]
-    out_size = size(gamma)[1]
-    ensemble_size = size(alpha)[2]
-    samples_per_model = batch_size ÷ ensemble_size
-
-    # Alpha, gamma shapes - [units, ensembles, rank]
-    e_b = repeat(e_b, samples_per_model)
-    alpha = repeat(alpha, samples_per_model)
-    gamma = repeat(gamma, samples_per_model)
-    # Reshape alpha, gamma to [units, batch_size, rank]
-    e_b = reshape(e_b, (1, 1, out_size, batch_size))
-    alpha = reshape(alpha, (1, 1, in_size, batch_size))
-    gamma = reshape(gamma, (1, 1, out_size, batch_size))
-
-    perturbed_x = x .* alpha
-    output = layer(perturbed_x) .* gamma
-    output = e_σ.(output .+ e_b)
-
-    return output
-end
diff --git a/src/layers/BatchEnsemble/dense.jl b/src/layers/BatchEnsemble/dense.jl
deleted file mode 100644
index 9ef92921..00000000
--- a/src/layers/BatchEnsemble/dense.jl
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-DenseBatchEnsemble(in, out, rank,
-                   ensemble_size,
-                   σ=identity;
-                   bias=true,
-                   init=glorot_normal,
-                   alpha_init=glorot_normal,
-                   gamma_init=glorot_normal)
-DenseBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
-
-Creates a dense BatchEnsemble layer. Batch ensemble is a memory efficient alternative
-for deep ensembles. In deep ensembles, if the ensemble size is N, N different models
-are trained, making the time and memory complexity O(N * complexity of one network).
-BatchEnsemble generates weight matrices for each member in the ensemble using a
-couple of rank 1 vectors R (alpha), S (gamma), RS' and multiplying the result with
-weight matrix W element wise. We also call R and S as fast weights.
-
-Reference - https://arxiv.org/abs/2002.06715
-
-During both training and testing, we repeat the samples along the batch dimension
-N times, where N is the ensemble_size. For example, if each mini batch has 10 samples
-and our ensemble size is 4, then the actual input to the layer has 40 samples.
-The output of the layer has 40 samples as well, and each 10 samples can be considered
-as the output of an esnemble member.
-
-# Fields
-- `layer`: The dense layer which transforms the pertubed input to output
-- `alpha`: The first Fast weight of size (in_dim, ensemble_size)
-- `gamma`: The second Fast weight of size (out_dim, ensemble_size)
-- `ensemble_bias`: Bias added to the ensemble output, separate from dense layer bias
-- `ensemble_act`: The activation function to be applied on ensemble output
-- `rank`: Rank of the fast weights (rank > 1 doesn't work on GPU for now)
-
-# Arguments
-- `in::Integer`: Input dimension of features
-- `out::Integer`: Output dimension of features
-- `rank::Integer`: Rank of the fast weights
-- `ensemble_size::Integer`: Number of models in the ensemble
-- `σ::F=identity`: Activation of the dense layer, defaults to identity
-- `init=glorot_normal`: Initialization function, defaults to glorot_normal
-- `alpha_init=glorot_normal`: Initialization function for the alpha fast weight,
-    defaults to glorot_normal
-- `gamma_init=glorot_normal`: Initialization function for the gamma fast weight,
-    defaults to glorot_normal
-- `bias::Bool=true`: Toggle the usage of bias in the dense layer
-- `ensemble_bias::Bool=true`: Toggle the usage of ensemble bias
-- `ensemble_act::F=identity`: Activation function for enseble outputs
-"""
-struct DenseBatchEnsemble{L,F,M,B}
-    layer::L
-    alpha::M
-    gamma::M
-    ensemble_bias::B
-    ensemble_act::F
-    rank::Any
-    function DenseBatchEnsemble(
-        layer::L,
-        alpha::M,
-        gamma::M,
-        ensemble_bias = true,
-        ensemble_act::F = identity,
-        rank = 1,
-    ) where {M,F,L}
-        ensemble_bias = create_bias(gamma, ensemble_bias, size(gamma)[1], size(gamma)[2])
-        new{typeof(layer),F,M,typeof(ensemble_bias)}(
-            layer,
-            alpha,
-            gamma,
-            ensemble_bias,
-            ensemble_act,
-            rank,
-        )
-    end
-end
-
-function DenseBatchEnsemble(
-    in::Integer,
-    out::Integer,
-    rank::Integer,
-    ensemble_size::Integer,
-    σ = identity;
-    init = glorot_normal,
-    alpha_init = glorot_normal,
-    gamma_init = glorot_normal,
-    bias = true,
-    ensemble_bias = true,
-    ensemble_act = identity,
-)
-
-    layer = Flux.Dense(in, out, σ; init = init, bias = bias)
-    if rank >= 1
-        alpha_shape = (in, ensemble_size, rank)
-        gamma_shape = (out, ensemble_size, rank)
-    else
-        error("Rank must be >= 1.")
-    end
-    alpha = alpha_init(alpha_shape)
-    gamma = gamma_init(gamma_shape)
-
-    return DenseBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
-end
-
-@functor DenseBatchEnsemble
-
-"""
-The forward pass for a DenseBatchEnsemble layer. The input is initially perturbed
-using the first fast weight, then passed through the dense layer, and finall
-multiplied by the second fast weight.
-
-# Arguments
-- `x::AbstractVecOrMat`: Input tensors
-"""
-function (be::DenseBatchEnsemble)(x)
-    layer = be.layer
-    alpha = be.alpha
-    gamma = be.gamma
-    e_b = be.ensemble_bias
-    e_σ = be.ensemble_act
-    rank = be.rank
-
-    batch_size = size(x)[end]
-    in_size = size(alpha)[1]
-    out_size = size(gamma)[1]
-    ensemble_size = size(alpha)[2]
-    samples_per_model = batch_size ÷ ensemble_size
-
-    # Alpha, gamma shapes - [units, ensembles, rank]
-    alpha = reshape(alpha, (in_size, ensemble_size * rank))
-    gamma = reshape(gamma, (out_size, ensemble_size * rank))
-    # Repeat breaks on GPU when input dims > 2
-    alpha = repeat(alpha, samples_per_model)
-    gamma = repeat(gamma, samples_per_model)
-    # Reshape alpha, gamma to [units, batch_size, rank]
-    alpha = reshape(alpha, (in_size, batch_size, rank))
-    gamma = reshape(gamma, (out_size, batch_size, rank))
-    # Reshape inputs to [units, batch_size, 1] for broadcasting
-    x = Flux.unsqueeze(x, (ndims(x) + 1))
-    # Perturb the inputs
-    perturbed_x = x .* alpha
-    # Dense layer forward pass
-    outputs = layer(perturbed_x) .* gamma
-    # Reduce the rank dimension through summing it up
-    outputs = sum(outputs, dims = 3)
-    outputs = reshape(outputs, (out_size, samples_per_model, ensemble_size))
-    # Reshape ensemble bias
-    e_b = Flux.unsqueeze(e_b, ndims(e_b))
-
-    outputs = e_σ.(outputs .+ e_b)
-    outputs = reshape(outputs, (out_size, batch_size))
-    return outputs
-end
diff --git a/test/cuda/layers/batchensemble_gpu.jl b/test/cuda/layers/batchensemble_gpu.jl
deleted file mode 100644
index 2d6ed695..00000000
--- a/test/cuda/layers/batchensemble_gpu.jl
+++ /dev/null
@@ -1,77 +0,0 @@
-@testset "Dense batchensemble" begin
-    ensemble_size = 4
-    samples_per_model = 4
-    input_dim = 5
-    output_dim = 5
-    rank = 1
-    inputs = rand(Float32, input_dim, samples_per_model)
-    layer = DenseBatchEnsemble(
-        input_dim,
-        output_dim,
-        rank,
-        ensemble_size;
-        alpha_init = ones,
-        gamma_init = ones,
-    )
-    layer = layer |> gpu
-    batch_inputs = gpu(repeat(inputs, 1, ensemble_size))
-    batch_outputs = layer(batch_inputs)
-    # Do the computation in for loop to compare outputs
-    layer = layer |> cpu
-    loop_outputs = []
-    for i = 1:ensemble_size
-        perturbed_inputs = inputs .* layer.alpha[i]
-        outputs = layer.layer(perturbed_inputs) .* layer.gamma[i]
-        outputs = layer.ensemble_act.(outputs .+ layer.ensemble_bias[i])
-        push!(loop_outputs, outputs)
-    end
-    loop_outputs = Flux.batch(loop_outputs)
-    loop_outputs = reshape(loop_outputs, (output_dim, samples_per_model * ensemble_size))
-    @test batch_outputs isa CuArray
-    @test size(batch_outputs) == size(loop_outputs)
-    @test isapprox(cpu(batch_outputs), loop_outputs, atol = 0.05)
-end
-
-@testset "ConvBatchEnsemble" begin
-    ensemble_size = 4
-    samples_per_model = 4
-    input_dim = 5
-    output_dim = 10
-    rank = 1
-    inputs = rand(Float32, 10, 10, input_dim, samples_per_model)
-    beconv = ConvBatchEnsemble(
-        (5, 5),
-        5 => 10,
-        rank,
-        ensemble_size,
-        relu;
-        alpha_init = ones,
-        gamma_init = ones,
-    )
-    beconv = beconv |> gpu
-    batch_inputs = gpu(repeat(inputs, 1, 1, 1, ensemble_size))
-    batch_outputs = beconv(batch_inputs)
-    # Do the computation in for loop to compare outputs
-    beconv = beconv |> cpu
-    loop_outputs = []
-    for i = 1:ensemble_size
-        perturbed_inputs = inputs .* beconv.alpha[i]
-        outputs = beconv.layer(perturbed_inputs) .* beconv.gamma[i]
-        outputs = beconv.ensemble_act.(outputs .+ beconv.ensemble_bias[i])
-        push!(loop_outputs, outputs)
-    end
-    loop_outputs = Flux.batch(loop_outputs)
-    loop_outputs_size = size(batch_outputs)
-    loop_outputs = reshape(
-        loop_outputs,
-        (
-            loop_outputs_size[1],
-            loop_outputs_size[2],
-            output_dim,
-            samples_per_model * ensemble_size,
-        ),
-    )
-    @test batch_outputs isa CuArray
-    @test size(batch_outputs) == size(loop_outputs)
-    @test isapprox(cpu(batch_outputs), loop_outputs, atol = 0.05)
-end
diff --git a/test/cuda/layers/mclayers_gpu.jl b/test/cuda/layers/mclayers_gpu.jl
deleted file mode 100644
index f176cfbf..00000000
--- a/test/cuda/layers/mclayers_gpu.jl
+++ /dev/null
@@ -1,37 +0,0 @@
-function test_sparsity(x, target_sparsity; atol = 0.05)
-    number_of_zeros = count(ele -> (ele == 0.0), x)
-    sparsity = number_of_zeros / sum(length, x)
-    @test isapprox(target_sparsity, sparsity; atol)
-end
-
-@testset "MC Dense GPU" begin
-    dropout_rate = 0.35
-    # Test MC Dense layer
-    a = gpu(rand(Float32, 8, 32))
-    layer = gpu(MCDense(8, 16, dropout_rate))
-    output = layer(a)
-    # Test if it's CuArray
-    @test output isa CuArray
-    @test isequal(size(output), (16, 32))
-    test_sparsity(output, dropout_rate)
-    # Test MC dense dropout toggle
-    output = layer(a, dropout = false)
-    test_sparsity(output, 0)
-end
-
-@testset "MC Conv GPU" begin
-    dropout_rate = 0.4
-    # Test MC conv layer
-    a = gpu(rand(Float32, 32, 32, 3, 32))
-    layer = MCConv((5, 5), 3 => 6, dropout_rate) |> gpu
-    output = layer(a)
-    # Test if it's CuArray
-    @test output isa CuArray
-    # Test the output shape
-    @test isequal(size(output), (28, 28, 6, 32))
-    # Test the sparsity percentage in the array
-    test_sparsity(output, dropout_rate)
-    # Test MC conv dropout toggle
-    output = layer(a, dropout = false)
-    test_sparsity(output, 0)
-end
diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl
deleted file mode 100644
index 8e9a4141..00000000
--- a/test/cuda/runtests.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-using Flux, Test, CUDA
-
-@info "Testing GPU Support"
-CUDA.allowscalar(false)
-
-include("layers/mclayers_gpu.jl")
-include("layers/batchensemble_gpu.jl")
diff --git a/test/layers/batchensemble.jl b/test/layers/batchensemble.jl
deleted file mode 100644
index 4107178e..00000000
--- a/test/layers/batchensemble.jl
+++ /dev/null
@@ -1,72 +0,0 @@
-@testset "Dense batchensemble" begin
-    ensemble_size = 4
-    samples_per_model = 4
-    input_dim = 5
-    output_dim = 5
-    rank = 1
-    inputs = rand(Float32, input_dim, samples_per_model)
-    layer = DenseBatchEnsemble(
-        input_dim,
-        output_dim,
-        rank,
-        ensemble_size;
-        alpha_init = ones,
-        gamma_init = ones,
-    )
-    batch_inputs = repeat(inputs, 1, ensemble_size)
-    batch_outputs = layer(batch_inputs)
-    # Do the computation in for loop to compare outputs
-    loop_outputs = []
-    for i = 1:ensemble_size
-        perturbed_inputs = inputs .* layer.alpha[i]
-        outputs = layer.layer(perturbed_inputs) .* layer.gamma[i]
-        outputs = layer.ensemble_act.(outputs .+ layer.ensemble_bias[i])
-        push!(loop_outputs, outputs)
-    end
-    loop_outputs = Flux.batch(loop_outputs)
-    loop_outputs = reshape(loop_outputs, (output_dim, samples_per_model * ensemble_size))
-    @test size(batch_outputs) == size(loop_outputs)
-    @test isapprox(batch_outputs, loop_outputs, atol = 0.05)
-end
-
-@testset "ConvBatchEnsemble" begin
-    ensemble_size = 4
-    samples_per_model = 4
-    input_dim = 5
-    output_dim = 10
-    rank = 1
-    inputs = rand(Float32, 10, 10, input_dim, samples_per_model)
-    beconv = ConvBatchEnsemble(
-        (5, 5),
-        5 => 10,
-        rank,
-        ensemble_size,
-        relu;
-        alpha_init = ones,
-        gamma_init = ones,
-    )
-    batch_inputs = repeat(inputs, 1, 1, 1, ensemble_size)
-    batch_outputs = beconv(batch_inputs)
-
-    # Do the computation in for loop to compare outputs
-    loop_outputs = []
-    for i = 1:ensemble_size
-        perturbed_inputs = inputs .* beconv.alpha[i]
-        outputs = beconv.layer(perturbed_inputs) .* beconv.gamma[i]
-        outputs = beconv.ensemble_act.(outputs .+ beconv.ensemble_bias[i])
-        push!(loop_outputs, outputs)
-    end
-    loop_outputs = Flux.batch(loop_outputs)
-    loop_outputs_size = size(batch_outputs)
-    loop_outputs = reshape(
-        loop_outputs,
-        (
-            loop_outputs_size[1],
-            loop_outputs_size[2],
-            output_dim,
-            samples_per_model * ensemble_size,
-        ),
-    )
-    @test size(batch_outputs) == size(loop_outputs)
-    @test isapprox(batch_outputs, loop_outputs, atol = 0.05)
-end
diff --git a/test/layers/mclayers.jl b/test/layers/mclayers_test.jl
similarity index 50%
rename from test/layers/mclayers.jl
rename to test/layers/mclayers_test.jl
index 1e8c2627..9d963b4a 100644
--- a/test/layers/mclayers.jl
+++ b/test/layers/mclayers_test.jl
@@ -1,8 +1,5 @@
-function test_sparsity(x, target_sparsity; atol = 0.05)
-    number_of_zeros = count(ele -> (ele == 0.0), x)
-    sparsity = number_of_zeros / sum(length, x)
-    @test isapprox(target_sparsity, sparsity; atol)
-end
+using Test
+using DeepUncertainty: MCDense, MCConv
 
 @testset "MC Dense" begin
     dropout_rate = 0.35
@@ -10,11 +7,16 @@ end
     a = rand(Float32, 8, 32)
     layer = MCDense(8, 16, dropout_rate)
     output = layer(a)
+    number_of_zeros = count(x -> (x == 0.0), output)
+    sparsity = number_of_zeros / sum(length, output)
     @test isequal(size(output), (16, 32))
-    test_sparsity(output, dropout_rate)
+    @test isapprox(dropout_rate, sparsity; atol = 0.05)
+
     # Test MC dense dropout toggle
     output = layer(a, dropout = false)
-    test_sparsity(output, 0)
+    number_of_zeros = count(x -> (x == 0.0), output)
+    sparsity = number_of_zeros / sum(length, output)
+    @test isapprox(0, sparsity; atol = 0.05)
 end
 
 @testset "MC Conv" begin
@@ -23,11 +25,16 @@ end
     a = rand(Float32, 32, 32, 3, 32)
     layer = MCConv((5, 5), 3 => 6, dropout_rate)
     output = layer(a)
+    number_of_zeros = count(x -> (x == 0.0), output)
+    sparsity = number_of_zeros / sum(length, output)
     # Test the output shape
     @test isequal(size(output), (28, 28, 6, 32))
    # Test the sparsity percentage in the array
-    test_sparsity(output, dropout_rate)
+    @test isapprox(dropout_rate, sparsity; atol = 0.05)
+
     # Test MC conv dropout toggle
     output = layer(a, dropout = false)
-    test_sparsity(output, 0)
+    number_of_zeros = count(x -> (x == 0.0), output)
+    sparsity = number_of_zeros / sum(length, output)
+    @test isapprox(0, sparsity; atol = 0.05)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 476d7605..99d1e32a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,20 +1,7 @@
 using DeepUncertainty
 using Test
-using Flux
-using Flux.CUDA
-using Flux: cpu, gpu
 
 @testset "Layers" begin
-    # MC layers
-    include("./layers/mclayers.jl")
-    # Batch ensembe layers
-    include("./layers/batchensemble.jl")
-end
-
-@testset "CUDA" begin
-    if CUDA.functional()
-        include("cuda/runtests.jl")
-    else
-        @warn "CUDA unavailable, not testing GPU support"
-    end
+    # Test the layers
+    include("./layers/mclayers_test.jl")
 end
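
Note (not part of the diff above): after this change only the Monte Carlo dropout layers remain exported, so downstream usage looks roughly like the sketch below. It is a minimal, hypothetical example: the MCDense constructor and the `dropout` keyword follow the retained tests, while the Chain/softmax wrapping and the 10-pass averaging are illustrative assumptions, not an API guarantee.

    using Flux, Statistics
    using DeepUncertainty: MCDense

    # Two stacked MC dropout layers; MCDense(in, out, rate) as in the tests above.
    model = Chain(MCDense(8, 16, 0.35), MCDense(16, 4, 0.35))
    x = rand(Float32, 8, 32)   # 32 samples with 8 features each

    # Monte Carlo inference: dropout stays active at test time, so repeated
    # forward passes give different predictions that can be averaged.
    samples = [softmax(model(x), dims = 1) for _ = 1:10]
    mean_prediction = mean(samples)   # elementwise mean over the 10 passes

    # Deterministic pass with dropout disabled (the toggle shown in the tests).
    deterministic = model[1](x, dropout = false)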