diff --git a/examples/bivariate.rs b/examples/bivariate.rs
index 67443fe..5d64419 100644
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@@ -2,22 +2,26 @@
 use std::io::Write;
 
-use neuramethyst::prelude::*;
-use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu};
+use neuramethyst::derivable::activation::Linear;
+#[allow(unused_imports)]
+use neuramethyst::derivable::activation::{LeakyRelu, Relu, Tanh};
 use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::derivable::regularize::NeuraElastic;
+use neuramethyst::prelude::*;
 
 use rand::Rng;
 
 fn main() {
     let mut network = neura_network![
-        neura_layer!("dense", LeakyRelu(0.01), 9, 2),
+        neura_layer!("dense", 2, 8; LeakyRelu(0.01)),
         neura_layer!("dropout", 0.1),
-        neura_layer!("dense", LeakyRelu(0.01), 9),
+        neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
         neura_layer!("dropout", 0.3),
-        neura_layer!("dense", LeakyRelu(0.01), 6),
+        neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
         neura_layer!("dropout", 0.1),
-        neura_layer!("dense", LeakyRelu(0.01), 4),
-        neura_layer!("dense", LeakyRelu(0.1), 2)
+        neura_layer!("dense", 4; LeakyRelu(0.1), NeuraElastic::new(0.0001, 0.002)),
+        neura_layer!("dense", 2; Linear),
+        neura_layer!("softmax"),
     ];
 
     // println!("{:#?}", network);
@@ -39,20 +43,23 @@ fn main() {
 
     let test_inputs: Vec<_> = inputs.clone().take(100).collect();
 
-    let mut trainer = NeuraBatchedTrainer::new(0.1, 4000);
-    trainer.log_epochs = 500;
+    let mut trainer = NeuraBatchedTrainer::new(0.25, 1000);
+    trainer.log_epochs = 50;
+    trainer.learning_momentum = 0.05;
+    trainer.batch_size = 2000;
 
     trainer.train(
         NeuraBackprop::new(Euclidean),
         &mut network,
         inputs,
-        &test_inputs
+        &test_inputs,
     );
 
     let mut file = std::fs::File::create("target/bivariate.csv").unwrap();
     for (input, _target) in test_inputs {
         let guess = argmax(&network.eval(&input));
         writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap();
+        // println!("{:?}", network.eval(&input));
     }
 
     // println!("{:#?}", network);
diff --git a/examples/xor.rs b/examples/xor.rs
index fa1a88b..be04629 100644
--- a/examples/xor.rs
+++ b/examples/xor.rs
@@ -1,25 +1,30 @@
 #![feature(generic_arg_infer)]
 
-use neuramethyst::prelude::*;
-use neuramethyst::derivable::activation::{Relu};
+use neuramethyst::derivable::activation::Relu;
 use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::prelude::*;
 
 fn main() {
     let mut network = neura_network![
-        neura_layer!("dense", Relu, 4, 2),
-        neura_layer!("dense", Relu, 3),
-        neura_layer!("dense", Relu, 1)
+        neura_layer!("dense", 2, 4; Relu),
+        neura_layer!("dense", 3; Relu),
+        neura_layer!("dense", 1; Relu)
     ];
 
     let inputs = [
         ([0.0, 0.0], [0.0]),
         ([0.0, 1.0], [1.0]),
        ([1.0, 0.0], [1.0]),
-        ([1.0, 1.0], [0.0])
+        ([1.0, 1.0], [0.0]),
     ];
 
     for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
     }
 
     let mut trainer = NeuraBatchedTrainer::new(0.05, 1000);
@@ -35,6 +40,11 @@ fn main() {
     );
 
     for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
     }
 }
diff --git a/src/derivable/activation.rs b/src/derivable/activation.rs
index 9c13bd3..07b97da 100644
--- a/src/derivable/activation.rs
+++ b/src/derivable/activation.rs
@@ -36,10 +36,9 @@ impl NeuraDerivable<f64> for Relu {
 }
 
 #[derive(Clone, Copy, Debug, PartialEq)]
-pub struct LeakyRelu(pub f64);
+pub struct LeakyRelu<F>(pub F);
 
-
-impl NeuraDerivable<f64> for LeakyRelu {
+impl NeuraDerivable<f64> for LeakyRelu<f64> {
     #[inline(always)]
     fn eval(&self, input: f64) -> f64 {
         if input > 0.0 {
@@ -59,13 +58,13 @@ impl NeuraDerivable<f64> for LeakyRelu {
     }
 }
 
-impl NeuraDerivable<f32> for LeakyRelu {
+impl NeuraDerivable<f32> for LeakyRelu<f32> {
     #[inline(always)]
     fn eval(&self, input: f32) -> f32 {
         if input > 0.0 {
             input
         } else {
-            (self.0 as f32) * input
+            self.0 * input
         }
     }
 
@@ -74,7 +73,7 @@ impl NeuraDerivable<f32> for LeakyRelu {
         if input > 0.0 {
             1.0
         } else {
-            self.0 as f32
+            self.0
         }
     }
 }
diff --git a/src/derivable/mod.rs b/src/derivable/mod.rs
index 9888423..b5c4412 100644
--- a/src/derivable/mod.rs
+++ b/src/derivable/mod.rs
@@ -1,5 +1,6 @@
 pub mod activation;
 pub mod loss;
+pub mod regularize;
 
 pub trait NeuraDerivable<F> {
     fn eval(&self, input: F) -> F;
diff --git a/src/derivable/regularize.rs b/src/derivable/regularize.rs
new file mode 100644
index 0000000..91cdb30
--- /dev/null
+++ b/src/derivable/regularize.rs
@@ -0,0 +1,134 @@
+use super::*;
+
+/// Default regularization, which is no regularization
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL0;
+
+impl NeuraDerivable<f64> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f64) -> f64 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f64) -> f64 {
+        0.0
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f32) -> f32 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f32) -> f32 {
+        0.0
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL1<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL1<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL1<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL2<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL2<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        self.0 * (input * input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        self.0 * at
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL2<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        self.0 * (input * input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        self.0 * at
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraElastic<F> {
+    pub l1: F,
+    pub l2: F,
+}
+
+impl<F> NeuraElastic<F> {
+    pub fn new(l1_factor: F, l2_factor: F) -> Self {
+        Self {
+            l1: l1_factor,
+            l2: l2_factor,
+        }
+    }
+}
+
+impl NeuraDerivable<f64> for NeuraElastic<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraElastic<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
diff --git a/src/layer/dense.rs b/src/layer/dense.rs
index bf94b76..2c9e5fc 100644
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@@ -1,39 +1,53 @@
 use super::NeuraLayer;
 
-use crate::{derivable::NeuraDerivable, utils::{multiply_matrix_vector, reverse_dot_product, multiply_matrix_transpose_vector}, train::NeuraTrainableLayer, algebra::NeuraVectorSpace};
+use crate::{
+    algebra::NeuraVectorSpace,
+    derivable::NeuraDerivable,
+    train::NeuraTrainableLayer,
+    utils::{multiply_matrix_transpose_vector, multiply_matrix_vector, reverse_dot_product},
+};
 
-use rand_distr::Distribution;
 use rand::Rng;
+use rand_distr::Distribution;
 
 #[derive(Clone, Debug)]
 pub struct NeuraDenseLayer<
     Act: NeuraDerivable<f64>,
+    Reg: NeuraDerivable<f64>,
     const INPUT_LEN: usize,
     const OUTPUT_LEN: usize,
 > {
     weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
     bias: [f64; OUTPUT_LEN],
     activation: Act,
+    regularization: Reg,
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
-    NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     pub fn new(
         weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
         bias: [f64; OUTPUT_LEN],
         activation: Act,
+        regularization: Reg,
     ) -> Self {
         Self {
             weights,
             bias,
             activation,
+            regularization,
         }
     }
 
-    pub fn from_rng(rng: &mut impl Rng, activation: Act) -> Self {
+    pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
         let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN];
-        let distribution = rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
+        let distribution =
+            rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
 
         for i in 0..OUTPUT_LEN {
             for j in 0..INPUT_LEN {
@@ -46,12 +60,17 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
             // Biases are zero-initialized, as this shouldn't cause any issues during training
             bias: [0.0; OUTPUT_LEN],
             activation,
+            regularization,
         }
     }
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     type Input = [f64; INPUT_LEN];
 
@@ -68,13 +87,21 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
     }
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraTrainableLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     type Delta = ([[f64; INPUT_LEN]; OUTPUT_LEN], [f64; OUTPUT_LEN]);
 
     // TODO: double-check the math in this
-    fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) {
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
         let evaluated = multiply_matrix_vector(&self.weights, input);
         // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta`
         let mut delta = epsilon.clone();
@@ -96,17 +123,32 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
         NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
         NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
     }
+
+    fn regularize(&self) -> Self::Delta {
+        let mut res = ([[0.0; INPUT_LEN]; OUTPUT_LEN], [0.0; OUTPUT_LEN]);
+
+        for i in 0..OUTPUT_LEN {
+            for j in 0..INPUT_LEN {
+                res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
+            }
+        }
+
+        // Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
+
+        res
+    }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
-    use crate::derivable::activation::Relu;
+    use crate::derivable::{activation::Relu, regularize::NeuraL0};
 
     #[test]
     fn test_from_rng() {
         let mut rng = rand::thread_rng();
-        let layer: NeuraDenseLayer<_, 64, 32> = NeuraDenseLayer::from_rng(&mut rng, Relu);
+        let layer: NeuraDenseLayer<_, _, 64, 32> =
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
         let mut input = [0.0; 64];
         for x in 0..64 {
             input[x] = rng.gen();
diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs
index d8ff615..5ce6479 100644
--- a/src/layer/dropout.rs
+++ b/src/layer/dropout.rs
@@ -59,6 +59,10 @@ impl<R: Rng, const LENGTH: usize> NeuraTrainableLayer for NeuraDropoutLayer<LENGTH, R>
+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
     #[inline(always)]
     fn apply_gradient(&mut self, _gradient: &Self::Delta) {
         // Noop
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index a74bd69..4ac7393 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -4,6 +4,9 @@ pub use dense::NeuraDenseLayer;
 mod dropout;
 pub use dropout::NeuraDropoutLayer;
 
+mod softmax;
+pub use softmax::NeuraSoftmaxLayer;
+
 pub trait NeuraLayer {
     type Input;
     type Output;
@@ -13,18 +16,34 @@ pub trait NeuraLayer {
 
 #[macro_export]
 macro_rules! neura_layer {
-    ( "dense", $activation:expr, $output:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, _, $output>
+    ( "dense", $( $shape:expr ),*; $activation:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "_dense_shape", $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, _, $output>
     };
 
-    ( "dense", $activation:expr, $output:expr, $input:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, $input, $output>
+    ( "_dense_shape", $input:expr, $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, $input, $output>
     };
 
     ( "dropout", $probability:expr ) => {
-        NeuraDropoutLayer::new($probability, rand::thread_rng())
-            as NeuraDropoutLayer<_, _>
+        $crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
+            as $crate::layer::NeuraDropoutLayer<_, _>
+    };
+
+    ( "softmax" ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
+    };
+
+    ( "softmax", $length:expr ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
     };
 }
diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs
new file mode 100644
index 0000000..8160e50
--- /dev/null
+++ b/src/layer/softmax.rs
@@ -0,0 +1,155 @@
+use crate::{train::NeuraTrainableLayer, utils::multiply_vectors_pointwise};
+
+use super::NeuraLayer;
+
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub struct NeuraSoftmaxLayer<const LENGTH: usize>;
+
+impl<const LENGTH: usize> NeuraSoftmaxLayer<LENGTH> {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl<const LENGTH: usize> NeuraLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Input = [f64; LENGTH];
+    type Output = [f64; LENGTH];
+
+    fn eval(&self, input: &Self::Input) -> Self::Output {
+        let mut res = input.clone();
+
+        let mut max = 0.0;
+        for item in &res {
+            if *item > max {
+                max = *item;
+            }
+        }
+
+        for item in &mut res {
+            *item = (*item - max).exp();
+        }
+
+        let mut sum = 0.0;
+        for item in &res {
+            sum += item;
+        }
+
+        for item in &mut res {
+            *item /= sum;
+        }
+
+        res
+    }
+}
+
+impl<const LENGTH: usize> NeuraTrainableLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Delta = ();
+
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        mut epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
+        // Note: a constant value can be added to `input` to bring it to increase precision
+        let evaluated = self.eval(input);
+
+        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        epsilon = multiply_vectors_pointwise(&epsilon, &evaluated);
+
+        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
+        let sum_diagonal_terms: f64 = epsilon.iter().copied().sum();
+
+        for i in 0..LENGTH {
+            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and add it to $a_{l-1,i} \epsilon_{l,i}$
+            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
+        }
+
+        (epsilon, ())
+    }
+
+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Delta) {
+        // Noop
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::algebra::NeuraVectorSpace;
+    use crate::utils::{
+        matrix_from_diagonal, multiply_matrix_vector, reverse_dot_product, uniform_vector,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_softmax_eval() {
+        const EPSILON: f64 = 0.000002;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<3>;
+
+        let result = layer.eval(&[1.0, 2.0, 8.0]);
+
+        assert!((result[0] - 0.0009088).abs() < EPSILON);
+        assert!((result[1] - 0.0024704).abs() < EPSILON);
+        assert!((result[2] - 0.9966208).abs() < EPSILON);
+    }
+
+    // Based on https://stats.stackexchange.com/a/306710
+    #[test]
+    fn test_softmax_backpropagation_two() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<2>;
+
+        for input1 in [0.2, 0.3, 0.5] as [f64; 3] {
+            for input2 in [0.7, 1.1, 1.3] {
+                let input = [input1, input2];
+                let sum = input1.exp() + input2.exp();
+                let output = [input1.exp() / sum, input2.exp() / sum];
+                for epsilon1 in [1.7, 1.9, 2.3] {
+                    for epsilon2 in [2.9, 3.1, 3.7] {
+                        let epsilon = [epsilon1, epsilon2];
+
+                        let (epsilon, _) = layer.backpropagate(&input, epsilon);
+                        let expected = [
+                            output[0] * (1.0 - output[0]) * epsilon1
+                                - output[1] * output[0] * epsilon2,
+                            output[1] * (1.0 - output[1]) * epsilon2
+                                - output[1] * output[0] * epsilon1,
+                        ];
+
+                        assert!((epsilon[0] - expected[0]).abs() < EPSILON);
+                        assert!((epsilon[1] - expected[1]).abs() < EPSILON);
+                    }
+                }
+            }
+        }
+    }
+
+    // Based on https://e2eml.school/softmax.html
+    #[test]
+    fn test_softmax_backpropagation() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<4>;
+
+        for _ in 0..100 {
+            let input: [f64; 4] = uniform_vector();
+            let evaluated = layer.eval(&input);
+            let loss: [f64; 4] = uniform_vector();
+
+            let mut derivative = reverse_dot_product(&evaluated, &evaluated);
+            derivative.mul_assign(-1.0);
+            derivative.add_assign(&matrix_from_diagonal(&evaluated));
+
+            let expected = multiply_matrix_vector(&derivative, &loss);
+            let (actual, _) = layer.backpropagate(&input, loss);
+
+            for i in 0..4 {
+                assert!((expected[i] - actual[i]).abs() < EPSILON);
+            }
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0c66809..4bfe1e5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,15 +10,11 @@ mod utils;
 
 pub mod prelude {
     // Macros
-    pub use crate::{neura_network, neura_layer};
+    pub use crate::{neura_layer, neura_network};
 
     // Structs and traits
-    pub use crate::network::{NeuraNetwork};
-    pub use crate::layer::{
-        NeuraLayer,
-        NeuraDenseLayer,
-        NeuraDropoutLayer
-    };
+    pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer};
+    pub use crate::network::NeuraNetwork;
     pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
     pub use crate::utils::cycle_shuffling;
 }
diff --git a/src/network.rs b/src/network.rs
index f165e3e..3a6f4dc 100644
--- a/src/network.rs
+++ b/src/network.rs
@@ -82,6 +82,10 @@ impl<Layer: NeuraTrainableLayer> NeuraTrainable for NeuraNetwork<Layer> {
         self.layer.backpropagate(&input, backprop_epsilon)
     }
 
+    fn regularize(&self) -> Self::Delta {
+        self.layer.regularize()
+    }
+
     fn prepare_epoch(&mut self) {
         self.layer.prepare_epoch();
     }
@@ -117,6 +121,10 @@ impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainable<Input = Layer::Output>>
+    fn regularize(&self) -> Self::Delta {
+        (self.layer.regularize(), self.child_network.regularize())
+    }
+
     fn prepare_epoch(&mut self) {
         self.layer.prepare_epoch();
         self.child_network.prepare_epoch();
@@ -145,7 +153,11 @@ macro_rules! neura_network {
 
 #[cfg(test)]
 mod test {
-    use crate::{derivable::activation::Relu, layer::NeuraDenseLayer, neura_layer};
+    use crate::{
+        derivable::{activation::Relu, regularize::NeuraL0},
+        layer::NeuraDenseLayer,
+        neura_layer,
+    };
 
     use super::*;
 
@@ -154,23 +166,24 @@ mod test {
         let mut rng = rand::thread_rng();
 
         let _ = neura_network![
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 2>
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 2>
         ];
 
-        let _ =
-            neura_network![NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,];
+        let _ = neura_network![
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+        ];
 
         let _ = neura_network![
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
         ];
 
         let _ = neura_network![
-            neura_layer!("dense", Relu, 16, 8),
-            neura_layer!("dense", Relu, 12),
-            neura_layer!("dense", Relu, 2)
+            neura_layer!("dense", 8, 16; Relu),
+            neura_layer!("dense", 12; Relu),
+            neura_layer!("dense", 2; Relu)
         ];
     }
 }
diff --git a/src/train.rs b/src/train.rs
index b2e71c9..c7627fe 100644
--- a/src/train.rs
+++ b/src/train.rs
@@ -1,8 +1,5 @@
 use crate::{
-    algebra::NeuraVectorSpace,
-    derivable::NeuraLoss,
-    layer::NeuraLayer,
-    network::NeuraNetwork,
+    algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer, network::NeuraNetwork,
 };
 
 // TODO: move this to layer/mod.rs
@@ -26,6 +23,9 @@ pub trait NeuraTrainableLayer: NeuraLayer {
         epsilon: Self::Output,
     ) -> (Self::Input, Self::Delta);
 
+    /// Computes the regularization
+    fn regularize(&self) -> Self::Delta;
+
     /// Applies `δW_l` to the weights of the layer
     fn apply_gradient(&mut self, gradient: &Self::Delta);
 
@@ -51,6 +51,9 @@ pub trait NeuraTrainable: NeuraLayer {
         loss: Loss,
     ) -> (Self::Input, Self::Delta);
 
+    /// Should return the regularization gradient
+    fn regularize(&self) -> Self::Delta;
+
     /// Called before an epoch begins, to allow the network to set itself up for training.
     fn prepare_epoch(&mut self);
 
@@ -89,8 +92,8 @@ impl<Loss> NeuraBackprop<Loss> {
     }
 }
 
-impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraGradientSolver<[f64; N], Loss::Target>
-    for NeuraBackprop<Loss>
+impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone>
+    NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop<Loss>
 {
     fn get_gradient(
         &self,
@@ -184,15 +187,17 @@ impl NeuraBatchedTrainer {
         NeuraNetwork<Layer, ChildNetwork>: NeuraTrainable,
         Layer::Input: Clone,
     {
-        // TODO: apply shuffling?
         let mut iter = inputs.into_iter();
         let factor = -self.learning_rate / (self.batch_size as f64);
         let momentum_factor = self.learning_momentum / self.learning_rate;
+        let reg_factor = -self.learning_rate;
 
         // Contains `momentum_factor * factor * gradient_sum_previous_iter`
-        let mut previous_gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+        let mut previous_gradient_sum =
+            <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
 
         'd: for epoch in 0..self.epochs {
-            let mut gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+            let mut gradient_sum =
+                <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
             network.prepare_epoch();
 
             for _ in 0..self.batch_size {
@@ -205,6 +210,12 @@ impl NeuraBatchedTrainer {
             }
 
             gradient_sum.mul_assign(factor);
+
+            // Add regularization gradient (TODO: check if it can be factored out of momentum)
+            let mut reg_gradient = network.regularize();
+            reg_gradient.mul_assign(reg_factor);
+            gradient_sum.add_assign(&reg_gradient);
+
             network.apply_gradient(&gradient_sum);
 
             if self.learning_momentum != 0.0 {
@@ -230,23 +241,21 @@ impl NeuraBatchedTrainer {
 
 #[cfg(test)]
 mod test {
-    use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}};
     use super::*;
+    use crate::{
+        derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
+        layer::NeuraDenseLayer,
+    };
 
     #[test]
     fn test_backpropagation_simple() {
         for wa in [0.0, 0.25, 0.5, 1.0] {
             for wb in [0.0, 0.25, 0.5, 1.0] {
-                let network = NeuraNetwork::new(
-                    NeuraDenseLayer::new([[wa, wb]], [0.0], Linear),
-                    ()
-                );
-
-                let gradient = NeuraBackprop::new(Euclidean).get_gradient(
-                    &network,
-                    &[1.0, 1.0],
-                    &[0.0]
-                );
+                let network =
+                    NeuraNetwork::new(NeuraDenseLayer::new([[wa, wb]], [0.0], Linear, NeuraL0), ());
+
+                let gradient =
+                    NeuraBackprop::new(Euclidean).get_gradient(&network, &[1.0, 1.0], &[0.0]);
 
                 let expected = wa + wb;
                 assert!((gradient.0[0][0] - expected) < 0.001);
diff --git a/src/utils.rs b/src/utils.rs
index c6db3a4..081312b 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -33,6 +33,7 @@ pub(crate) fn multiply_matrix_transpose_vector<const WIDTH: usize, const HEIGHT: usize>(
     left: &[f64; HEIGHT],
     right: &[f64; WIDTH],
@@ -48,6 +49,32 @@ pub(crate) fn reverse_dot_product<const HEIGHT: usize, const WIDTH: usize>(
     result
 }
 
+pub(crate) fn multiply_vectors_pointwise<const LENGTH: usize>(
+    left: &[f64; LENGTH],
+    right: &[f64; LENGTH],
+) -> [f64; LENGTH] {
+    let mut result = [0.0; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i] = left[i] * right[i];
+    }
+
+    result
+}
+
+#[cfg(test)]
+pub(crate) fn matrix_from_diagonal<const LENGTH: usize>(
+    vector: &[f64; LENGTH],
+) -> [[f64; LENGTH]; LENGTH] {
+    let mut result = [[0.0; LENGTH]; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i][i] = vector[i];
+    }
+
+    result
+}
+
 #[allow(dead_code)]
 pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f64; N]) {
     for i in 0..N {
@@ -89,7 +116,10 @@ struct ShuffleCycled<I: Iterator, R: rand::Rng> {
     rng: R,
 }
 
-impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item: Clone {
+impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R>
+where
+    I::Item: Clone,
+{
     type Item = I::Item;
 
     #[inline]
@@ -99,7 +129,7 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
         if let Some(next) = self.iter.next() {
             // Base iterator is not empty yet
             self.buffer.push(next.clone());
-            return Some(next)
+            return Some(next);
         } else if self.buffer.len() > 0 {
             if self.index == 0 {
                 // Shuffle the vector and return the first element, setting the index to 1
@@ -118,12 +148,9 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
     }
 }
 
-pub fn cycle_shuffling<I: Iterator>(
-    iter: I,
-    rng: impl rand::Rng
-) -> impl Iterator<Item = I::Item>
+pub fn cycle_shuffling<I: Iterator>(iter: I, rng: impl rand::Rng) -> impl Iterator<Item = I::Item>
 where
-    I::Item: Clone
+    I::Item: Clone,
 {
     let size_hint = iter.size_hint();
     let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
@@ -132,6 +159,19 @@ where
         buffer: Vec::with_capacity(size_hint),
         index: 0,
         iter,
-        rng
+        rng,
     }
 }
+
+#[cfg(test)]
+pub(crate) fn uniform_vector<const LENGTH: usize>() -> [f64; LENGTH] {
+    use rand::Rng;
+    let mut res = [0.0; LENGTH];
+    let mut rng = rand::thread_rng();
+
+    for i in 0..LENGTH {
+        res[i] = rng.gen();
+    }
+
+    res
+}