diff --git a/Cargo.toml b/Cargo.toml
index 8bbb664..9d8f4ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,3 +9,4 @@ edition = "2021"
 ndarray = "^0.15"
 # num-traits = "0.2.15"
 rand = "^0.8"
+rand_distr = "0.4.3"
diff --git a/examples/bivariate.rs b/examples/bivariate.rs
index 4a7c0b0..67443fe 100644
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@@ -10,10 +10,16 @@ use rand::Rng;
 
 fn main() {
     let mut network = neura_network![
-        neura_layer!("dense", LeakyRelu(0.01), 4, 2),
-        neura_layer!("dense", Tanh, 3),
-        neura_layer!("dense", Relu, 2)
+        neura_layer!("dense", LeakyRelu(0.01), 9, 2),
+        neura_layer!("dropout", 0.1),
+        neura_layer!("dense", LeakyRelu(0.01), 9),
+        neura_layer!("dropout", 0.3),
+        neura_layer!("dense", LeakyRelu(0.01), 6),
+        neura_layer!("dropout", 0.1),
+        neura_layer!("dense", LeakyRelu(0.01), 4),
+        neura_layer!("dense", LeakyRelu(0.1), 2)
     ];
+    // println!("{:#?}", network);
 
     let mut rng = rand::thread_rng();
     let inputs = (0..=1).cycle().map(move |category| {
@@ -48,6 +54,8 @@
         let guess = argmax(&network.eval(&input));
         writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap();
     }
+
+    // println!("{:#?}", network);
 }
 
 fn one_hot<const N: usize>(value: usize) -> [f64; N] {
diff --git a/src/algebra.rs b/src/algebra.rs
index 8ed39ab..d3bcd12 100644
--- a/src/algebra.rs
+++ b/src/algebra.rs
@@ -7,6 +7,23 @@ pub trait NeuraVectorSpace {
     fn zero() -> Self;
 }
 
+impl NeuraVectorSpace for () {
+    #[inline(always)]
+    fn add_assign(&mut self, _other: &Self) {
+        // Noop
+    }
+
+    #[inline(always)]
+    fn mul_assign(&mut self, _by: f64) {
+        // Noop
+    }
+
+    #[inline(always)]
+    fn zero() -> Self {
+        ()
+    }
+}
+
 impl<Left: NeuraVectorSpace, Right: NeuraVectorSpace> NeuraVectorSpace for (Left, Right) {
     fn add_assign(&mut self, other: &Self) {
         NeuraVectorSpace::add_assign(&mut self.0, &other.0);
diff --git a/src/layer/dense.rs b/src/layer/dense.rs
index 1776bc8..bf94b76 100644
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@@ -1,5 +1,7 @@
 use super::NeuraLayer;
 use crate::{derivable::NeuraDerivable, utils::{multiply_matrix_vector, reverse_dot_product, multiply_matrix_transpose_vector}, train::NeuraTrainableLayer, algebra::NeuraVectorSpace};
+
+use rand_distr::Distribution;
 use rand::Rng;
 
 #[derive(Clone, Debug)]
@@ -31,11 +33,11 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
     pub fn from_rng(rng: &mut impl Rng, activation: Act) -> Self {
         let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN];
-        let multiplier = std::f64::consts::SQRT_2 / (INPUT_LEN as f64).sqrt();
+        let distribution = rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
 
         for i in 0..OUTPUT_LEN {
             for j in 0..INPUT_LEN {
-                weights[i][j] = rng.gen_range(0.0..multiplier);
+                weights[i][j] = distribution.sample(rng);
             }
         }
diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs
new file mode 100644
index 0000000..d8ff615
--- /dev/null
+++ b/src/layer/dropout.rs
@@ -0,0 +1,87 @@
+use rand::Rng;
+
+use crate::train::NeuraTrainableLayer;
+
+use super::NeuraLayer;
+
+#[derive(Clone, Debug)]
+pub struct NeuraDropoutLayer<R: Rng, const LENGTH: usize> {
+    pub dropout_probability: f64,
+    multiplier: f64,
+    mask: [bool; LENGTH],
+    rng: R,
+}
+
+impl<R: Rng, const LENGTH: usize> NeuraDropoutLayer<R, LENGTH> {
+    pub fn new(dropout_probability: f64, rng: R) -> Self {
+        Self {
+            dropout_probability,
+            multiplier: 1.0,
+            mask: [false; LENGTH],
+            rng,
+        }
+    }
+
+    fn apply_dropout(&self, vector: &mut [f64; LENGTH]) {
+        for (index, &dropout) in self.mask.iter().enumerate() {
+            if dropout {
+                vector[index] = 0.0;
+            } else {
+                vector[index] *= self.multiplier;
+            }
+        }
+    }
+}
+
+impl<R: Rng, const LENGTH: usize> NeuraLayer for NeuraDropoutLayer<R, LENGTH> {
+    type Input = [f64; LENGTH];
+    type Output = [f64; LENGTH];
+
+    fn eval(&self, input: &Self::Input) -> Self::Output {
+        let mut result = input.clone();
+
+        self.apply_dropout(&mut result);
+
+        result
+    }
+}
+
+impl<R: Rng, const LENGTH: usize> NeuraTrainableLayer for NeuraDropoutLayer<R, LENGTH> {
+    type Delta = ();
+
+    fn backpropagate(
+        &self,
+        _input: &Self::Input,
+        mut epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
+        self.apply_dropout(&mut epsilon);
+
+        (epsilon, ())
+    }
+
+    #[inline(always)]
+    fn apply_gradient(&mut self, _gradient: &Self::Delta) {
+        // Noop
+    }
+
+    fn prepare_epoch(&mut self) {
+        // Rejection sampling to prevent all the inputs from being dropped out
+        loop {
+            let mut sum = 0;
+            for i in 0..LENGTH {
+                self.mask[i] = self.rng.gen_bool(self.dropout_probability);
+                sum += (!self.mask[i]) as usize;
+            }
+
+            if sum < LENGTH {
+                self.multiplier = LENGTH as f64 / sum as f64;
+                break;
+            }
+        }
+    }
+
+    fn cleanup(&mut self) {
+        self.mask = [false; LENGTH];
+        self.multiplier = 1.0;
+    }
+}
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index d10964b..a74bd69 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -1,6 +1,9 @@
 mod dense;
 pub use dense::NeuraDenseLayer;
 
+mod dropout;
+pub use dropout::NeuraDropoutLayer;
+
 pub trait NeuraLayer {
     type Input;
     type Output;
@@ -19,4 +22,9 @@ macro_rules! neura_layer {
         NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
             as NeuraDenseLayer<_, $input, $output>
     };
+
+    ( "dropout", $probability:expr ) => {
+        NeuraDropoutLayer::new($probability, rand::thread_rng())
+            as NeuraDropoutLayer<_, _>
+    };
 }
diff --git a/src/lib.rs b/src/lib.rs
index eb9b2e8..0c66809 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,7 +14,11 @@ pub mod prelude {
 
     // Structs and traits
     pub use crate::network::{NeuraNetwork};
-    pub use crate::layer::{NeuraLayer, NeuraDenseLayer};
+    pub use crate::layer::{
+        NeuraLayer,
+        NeuraDenseLayer,
+        NeuraDropoutLayer
+    };
     pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
     pub use crate::utils::cycle_shuffling;
 }
diff --git a/src/network.rs b/src/network.rs
index 29ac603..f165e3e 100644
--- a/src/network.rs
+++ b/src/network.rs
@@ -81,6 +81,14 @@ impl NeuraTrainable for NeuraNetwork {
         let backprop_epsilon = loss.nabla(target, &final_activation);
         self.layer.backpropagate(&input, backprop_epsilon)
     }
+
+    fn prepare_epoch(&mut self) {
+        self.layer.prepare_epoch();
+    }
+
+    fn cleanup(&mut self) {
+        self.layer.cleanup();
+    }
 }
 
 impl> NeuraTrainable
@@ -108,6 +116,16 @@ impl
     ) -> (Self::Input, Self::Delta);
+
+    /// Called before an epoch begins, to allow the network to set itself up for training.
+    fn prepare_epoch(&mut self);
+
+    /// Called at the end of training, to allow the network to clean itself up
+    fn cleanup(&mut self);
 }
 
 pub trait NeuraGradientSolver {
@@ -179,6 +193,7 @@ impl NeuraBatchedTrainer {
         let mut previous_gradient_sum = as NeuraTrainable>::Delta::zero();
         'd: for epoch in 0..self.epochs {
             let mut gradient_sum = as NeuraTrainable>::Delta::zero();
+            network.prepare_epoch();
 
             for _ in 0..self.batch_size {
                 if let Some((input, target)) = iter.next() {
@@ -199,6 +214,7 @@
             }
 
             if self.log_epochs > 0 && (epoch + 1) % self.log_epochs == 0 {
+                network.cleanup();
                 let mut loss_sum = 0.0;
                 for (input, target) in test_inputs {
                     loss_sum += gradient_solver.score(&network, input, target);
                 }
                 println!("Epoch {}, Loss: {:.3}", epoch + 1, loss_sum);
             }
         }
+
+        network.cleanup();
     }
 }
diff --git a/src/utils.rs b/src/utils.rs
index 4f8e535..c6db3a4 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -48,6 +48,7 @@ pub(crate) fn reverse_dot_product(
     result
 }
 
+#[allow(dead_code)]
 pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f64; N]) {
     for i in 0..N {
         sum[i] += operand[i];
@@ -81,14 +82,6 @@ impl Iterator for Chunked {
     }
 }
 
-pub(crate) fn chunked<I: Iterator>(
-    iter: I,
-    chunk_size: usize,
-) -> impl Iterator<Item = Vec<I::Item>> {
-    Chunked { iter, chunk_size }
-}
-
-
 struct ShuffleCycled {
     buffer: Vec,
     index: usize,
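
For reference, two short standalone sketches (not part of the patch) illustrate the techniques it introduces.

The new NeuraDropoutLayer implements inverted dropout: each unit is dropped with probability p, the mask is redrawn before every epoch, and the units that survive are scaled up so the layer's expected output stays roughly unchanged during training. The helper names dropout_mask and apply_dropout below are made up for this illustration; only the rand 0.8 API (Rng::gen_bool, rand::thread_rng) is assumed, and the rejection loop keeps redrawing until at least one unit survives, in the spirit of prepare_epoch above.

use rand::Rng;

// Draw a dropout mask of length N; `true` means "drop this unit".
// Rejection sampling: redraw until at least one unit is kept, so the
// compensating multiplier N / kept stays finite.
fn dropout_mask<const N: usize>(p: f64, rng: &mut impl Rng) -> ([bool; N], f64) {
    loop {
        let mut mask = [false; N];
        let mut kept = 0usize;
        for slot in mask.iter_mut() {
            *slot = rng.gen_bool(p);
            kept += (!*slot) as usize;
        }
        if kept > 0 {
            return (mask, N as f64 / kept as f64);
        }
    }
}

// Zero out dropped units and rescale the survivors.
fn apply_dropout<const N: usize>(vector: &mut [f64; N], mask: &[bool; N], multiplier: f64) {
    for (value, &dropped) in vector.iter_mut().zip(mask.iter()) {
        *value = if dropped { 0.0 } else { *value * multiplier };
    }
}

fn main() {
    let mut rng = rand::thread_rng();
    let (mask, multiplier) = dropout_mask::<4>(0.3, &mut rng);
    let mut activations = [1.0, 2.0, 3.0, 4.0];
    apply_dropout(&mut activations, &mask, multiplier);
    println!("{:?} (multiplier = {:.2})", activations, multiplier);
}

In the patch itself the same idea is driven by the trainer: prepare_epoch draws a fresh mask at the start of each epoch, eval and backpropagate apply it to activations and gradients alike, and cleanup resets the layer before the test-loss pass and at the end of training.

The other change, in dense.rs, replaces the uniform weight initialization with draws from a zero-mean normal distribution whose spread shrinks as the layer gets wider, in the spirit of Glorot/Xavier initialization. Below is a minimal sketch of that idea, assuming only rand 0.8 and rand_distr 0.4 (rand_distr::Normal::new takes a mean and a standard deviation, and the textbook Glorot normal rule uses sqrt(2 / (fan_in + fan_out)) as the standard deviation); the helper name glorot_normal_weights is hypothetical.

use rand::Rng;
use rand_distr::{Distribution, Normal};

// Fill an OUTPUT x INPUT weight matrix with Glorot-style normal samples.
fn glorot_normal_weights<const INPUT: usize, const OUTPUT: usize>(
    rng: &mut impl Rng,
) -> [[f64; INPUT]; OUTPUT] {
    let std_dev = (2.0 / (INPUT as f64 + OUTPUT as f64)).sqrt();
    let distribution = Normal::new(0.0, std_dev).unwrap();
    let mut weights = [[0.0; INPUT]; OUTPUT];
    for row in weights.iter_mut() {
        for weight in row.iter_mut() {
            *weight = distribution.sample(rng);
        }
    }
    weights
}

fn main() {
    let mut rng = rand::thread_rng();
    let weights = glorot_normal_weights::<3, 2>(&mut rng);
    println!("{:?}", weights);
}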