diff --git a/examples/bivariate.rs b/examples/bivariate.rs
index cb89a56..10da1af 100644
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@@ -13,10 +13,12 @@ use rand::Rng;
 
 fn main() {
     let mut network = neura_sequential![
-        neura_layer!("dense", 8),
+        neura_layer!("dense", 8).regularization(NeuraL1(0.001)),
         neura_layer!("dropout", 0.25),
-        neura_layer!("dense", 2).activation(Linear),
-        // neura_layer!("softmax"),
+        neura_layer!("dense", 2)
+            .activation(Linear)
+            .regularization(NeuraL1(0.001)),
+        neura_layer!("softmax"),
     ]
     .construct(NeuraShape::Vector(2))
     .unwrap();
diff --git a/examples/convolution.rs b/examples/convolution.disabled-rs
similarity index 100%
rename from examples/convolution.rs
rename to examples/convolution.disabled-rs
diff --git a/src/derivable/loss.rs b/src/derivable/loss.rs
index da7c2f5..ec26552 100644
--- a/src/derivable/loss.rs
+++ b/src/derivable/loss.rs
@@ -1,8 +1,6 @@
 use nalgebra::DVector;
 use num::Float;
 
-use crate::algebra::NeuraVector;
-
 use super::NeuraLoss;
 
 #[derive(Clone, Copy, Debug, PartialEq)]
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index 4e4e4e0..672ee00 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -2,8 +2,7 @@ use crate::algebra::NeuraVectorSpace;
 
 pub mod dense;
 pub mod dropout;
-
-pub use dense::NeuraDenseLayer;
+pub mod softmax;
 
 #[derive(Clone, Copy, PartialEq, Debug)]
 pub enum NeuraShape {
@@ -121,4 +120,8 @@ macro_rules! neura_layer {
     ( "dropout", $probability:expr ) => {
         $crate::layer::dropout::NeuraDropoutLayer::new($probability, rand::thread_rng())
     };
+
+    ( "softmax" ) => {
+        $crate::layer::softmax::NeuraSoftmaxLayer::new()
+    };
 }
diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs
new file mode 100644
index 0000000..e428677
--- /dev/null
+++ b/src/layer/softmax.rs
@@ -0,0 +1,175 @@
+use nalgebra::{DVector, Scalar};
+use num::{traits::NumAssignOps, Float};
+
+use super::*;
+
+#[derive(Clone, Debug)]
+pub struct NeuraSoftmaxLayer {
+    shape: NeuraShape,
+}
+
+impl NeuraSoftmaxLayer {
+    pub fn new() -> Self {
+        Self {
+            shape: NeuraShape::Vector(0),
+        }
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraLayer<DVector<F>> for NeuraSoftmaxLayer {
+    type Output = DVector<F>;
+
+    fn eval(&self, input: &DVector<F>) -> Self::Output {
+        let mut res = input.clone();
+
+        let mut max = F::zero();
+        for &item in &res {
+            if item > max {
+                max = item;
+            }
+        }
+
+        let mut sum = F::zero();
+        for item in &mut res {
+            *item = (*item - max).exp();
+            sum += *item;
+        }
+
+        res /= sum;
+
+        res
+    }
+}
+
+impl NeuraPartialLayer for NeuraSoftmaxLayer {
+    type Constructed = Self;
+    type Err = ();
+
+    fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
+        Ok(Self { shape: input_shape })
+    }
+
+    fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
+        constructed.shape
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraSoftmaxLayer {
+    type Gradient = ();
+
+    fn default_gradient(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn backprop_layer(
+        &self,
+        input: &DVector<F>,
+        mut epsilon: Self::Output,
+    ) -> (DVector<F>, Self::Gradient) {
+        // Note: a constant can be added to `input` to increase numerical precision (softmax is shift-invariant)
+        let evaluated = self.eval(input);
+
+        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        hadamard_product(&mut epsilon, &evaluated);
+
+        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
+        let sum_diagonal_terms = epsilon.sum();
+
+        for i in 0..input.len() {
+            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and subtract it from $a_{l-1,i} \epsilon_{l,i}$
+            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
+        }
+
+        (epsilon, ())
+    }
+
+    fn regularize_layer(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+        // Noop
+    }
+}
+
+fn hadamard_product<F: Float + Scalar + NumAssignOps>(left: &mut DVector<F>, right: &DVector<F>) {
+    for i in 0..left.len() {
+        left[i] *= right[i];
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use nalgebra::{dvector, DMatrix};
+
+    use crate::utils::uniform_vector;
+
+    use super::*;
+
+    #[test]
+    fn test_softmax_eval() {
+        const EPSILON: f64 = 0.000002;
+        let layer = NeuraSoftmaxLayer::new();
+
+        let result = layer.eval(&dvector![1.0, 2.0, 8.0]);
+
+        assert!((result[0] - 0.0009088).abs() < EPSILON);
+        assert!((result[1] - 0.0024704).abs() < EPSILON);
+        assert!((result[2] - 0.9966208).abs() < EPSILON);
+    }
+
+    // Based on https://stats.stackexchange.com/a/306710
+    #[test]
+    fn test_softmax_backpropagation_two() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new();
+
+        for input1 in [0.2, 0.3, 0.5] as [f64; 3] {
+            for input2 in [0.7, 1.1, 1.3] {
+                let input = dvector![input1, input2];
+                let sum = input1.exp() + input2.exp();
+                let output = dvector![input1.exp() / sum, input2.exp() / sum];
+                for epsilon1 in [1.7, 1.9, 2.3] {
+                    for epsilon2 in [2.9, 3.1, 3.7] {
+                        let epsilon = dvector![epsilon1, epsilon2];
+
+                        let (epsilon, _) = layer.backprop_layer(&input, epsilon);
+                        let expected = [
+                            output[0] * (1.0 - output[0]) * epsilon1
+                                - output[1] * output[0] * epsilon2,
+                            output[1] * (1.0 - output[1]) * epsilon2
+                                - output[1] * output[0] * epsilon1,
+                        ];
+
+                        assert!((epsilon[0] - expected[0]).abs() < EPSILON);
+                        assert!((epsilon[1] - expected[1]).abs() < EPSILON);
+                    }
+                }
+            }
+        }
+    }
+
+    // Based on https://e2eml.school/softmax.html
+    #[test]
+    fn test_softmax_backpropagation() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new();
+
+        for _ in 0..100 {
+            let input = uniform_vector(4);
+            let evaluated = layer.eval(&input);
+            let loss = uniform_vector(4);
+
+            let mut derivative = &evaluated * evaluated.transpose();
+            derivative *= -1.0;
+            derivative += DMatrix::from_diagonal(&evaluated);
+
+            let expected = derivative * &loss;
+            let (actual, _) = layer.backprop_layer(&input, loss);
+
+            for i in 0..4 {
+                assert!((expected[i] - actual[i]).abs() < EPSILON);
+            }
+        }
+    }
+}
diff --git a/src/network/sequential/mod.rs b/src/network/sequential/mod.rs
index cbcc54f..ed95a87 100644
--- a/src/network/sequential/mod.rs
+++ b/src/network/sequential/mod.rs
@@ -247,7 +247,7 @@ mod test {
 
     use crate::{
         derivable::{activation::Relu, regularize::NeuraL0},
-        layer::{NeuraDenseLayer, NeuraLayer, NeuraShape},
+        layer::{dense::NeuraDenseLayer, NeuraLayer, NeuraShape},
         neura_layer,
     };
 
diff --git a/src/train.rs b/src/train.rs
index b9de86e..a331955 100644
--- a/src/train.rs
+++ b/src/train.rs
@@ -186,7 +186,7 @@ mod test {
     use crate::{
         assert_approx,
         derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
-        layer::{NeuraDenseLayer, NeuraLayer},
+        layer::{dense::NeuraDenseLayer, NeuraLayer},
         network::sequential::{NeuraSequential, NeuraSequentialTail},
         neura_sequential,
     };
diff --git a/src/utils.rs b/src/utils.rs
index 442c5bd..6a1d976 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -89,17 +89,12 @@ where
 }
 
 #[cfg(test)]
-pub(crate) fn uniform_vector<const LENGTH: usize>() -> NeuraVector<LENGTH, f64> {
+pub(crate) fn uniform_vector(length: usize) -> nalgebra::DVector<f64> {
+    use nalgebra::DVector;
     use rand::Rng;
 
-    let mut res: NeuraVector<LENGTH, f64> = NeuraVector::default();
     let mut rng = rand::thread_rng();
-
-    for i in 0..LENGTH {
-        res[i] = rng.gen();
-    }
-
-    res
+    DVector::from_fn(length, |_, _| -> f64 { rng.gen() })
 }
 
 pub fn one_hot(value: usize) -> NeuraVector {