It was becoming almost impossible to manage the dimensions of the layers, especially with convolution layers. Const generics are nice, but they are still a bit too immature for this use case. We'll probably expand the implementations to accept either const-sized or dynamically-sized layers at some point, for performance-critical applications.
parent cc7686569a · commit 2edbff860c
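To illustrate the dimension bookkeeping the commit message refers to, here is a minimal sketch (not part of this commit; the concrete sizes are arbitrary) of how every dimension ends up in the type once layers are const-generic, so adjacent layers have to agree on their sizes by hand:

```rust
// Hypothetical sketch: a 64 -> 32 dense layer spelled out in full,
// with Act = Relu, Reg = NeuraL0, INPUT_LEN = 64, OUTPUT_LEN = 32.
type Hidden = NeuraDenseLayer<Relu, NeuraL0, 64, 32>;
// Any layer fed by `Hidden` must hard-code 32 as its INPUT_LEN,
// which is the kind of manual bookkeeping the commit message describes.
type Output = NeuraDenseLayer<Relu, NeuraL0, 32, 10>;
```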
@@ -0,0 +1,180 @@
use super::{NeuraLayer, NeuraTrainableLayer};
use crate::{
    algebra::{NeuraMatrix, NeuraVector, NeuraVectorSpace},
    derivable::NeuraDerivable,
};

use rand::Rng;
use rand_distr::Distribution;

#[derive(Clone, Debug)]
pub struct NeuraDenseLayer<
    Act: NeuraDerivable<f64>,
    Reg: NeuraDerivable<f64>,
    const INPUT_LEN: usize,
    const OUTPUT_LEN: usize,
> {
    weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
    bias: NeuraVector<OUTPUT_LEN, f64>,
    activation: Act,
    regularization: Reg,
}

impl<
        Act: NeuraDerivable<f64>,
        Reg: NeuraDerivable<f64>,
        const INPUT_LEN: usize,
        const OUTPUT_LEN: usize,
    > NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
    pub fn new(
        weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
        bias: NeuraVector<OUTPUT_LEN, f64>,
        activation: Act,
        regularization: Reg,
    ) -> Self {
        Self {
            weights,
            bias,
            activation,
            regularization,
        }
    }

    pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
        let mut weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64> = NeuraMatrix::from_value(0.0f64);

        // Use Xavier (or He) initialisation, using the harmonic mean
        // Ref: https://www.deeplearning.ai/ai-notes/initialization/index.html
        let distribution = rand_distr::Normal::new(
            0.0,
            activation.variance_hint() * 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64),
        )
        .unwrap();
        // let distribution = rand_distr::Uniform::new(-0.5, 0.5);

        for i in 0..OUTPUT_LEN {
            for j in 0..INPUT_LEN {
                weights[i][j] = distribution.sample(rng);
            }
        }

        Self {
            weights,
            // Biases are initialized based on the activation's hint
            bias: NeuraVector::from_value(activation.bias_hint()),
            activation,
            regularization,
        }
    }
}

impl<
        Act: NeuraDerivable<f64>,
        Reg: NeuraDerivable<f64>,
        const INPUT_LEN: usize,
        const OUTPUT_LEN: usize,
    > NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
    type Input = NeuraVector<INPUT_LEN, f64>;

    type Output = NeuraVector<OUTPUT_LEN, f64>;

    fn eval(&self, input: &Self::Input) -> Self::Output {
        let mut result = self.weights.multiply_vector(input);

        for i in 0..OUTPUT_LEN {
            result[i] = self.activation.eval(result[i] + self.bias[i]);
        }

        result
    }
}

impl<
        Act: NeuraDerivable<f64>,
        Reg: NeuraDerivable<f64>,
        const INPUT_LEN: usize,
        const OUTPUT_LEN: usize,
    > NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
    type Delta = (
        NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
        NeuraVector<OUTPUT_LEN, f64>,
    );

    fn backpropagate(
        &self,
        input: &Self::Input,
        epsilon: Self::Output,
    ) -> (Self::Input, Self::Delta) {
        let evaluated = self.weights.multiply_vector(input);
        // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
        // with `self.activation'(input) ° epsilon = delta`
        let mut delta: NeuraVector<OUTPUT_LEN, f64> = epsilon.clone();
        for i in 0..OUTPUT_LEN {
            delta[i] *= self.activation.derivate(evaluated[i]);
        }

        // Compute the weight gradient
        let weights_gradient = delta.reverse_dot(input);

        let new_epsilon = self.weights.transpose_multiply_vector(&delta);

        // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation
        // the gradient of the bias is equal to the delta term of the backpropagation algorithm
        let bias_gradient = delta;

        (new_epsilon, (weights_gradient, bias_gradient))
    }

    fn apply_gradient(&mut self, gradient: &Self::Delta) {
        NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
        NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
    }

    fn regularize(&self) -> Self::Delta {
        let mut res = Self::Delta::default();

        for i in 0..OUTPUT_LEN {
            for j in 0..INPUT_LEN {
                res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
            }
        }

        // Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network

        res
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::{
        derivable::{activation::Relu, regularize::NeuraL0},
        utils::uniform_vector,
    };

    #[test]
    fn test_from_rng() {
        let mut rng = rand::thread_rng();
        let layer: NeuraDenseLayer<_, _, 64, 32> =
            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
        let mut input = [0.0; 64];
        for x in 0..64 {
            input[x] = rng.gen();
        }
        assert!(layer.eval(&input.into()).len() == 32);
    }

    #[test]
    fn test_stack_overflow_big_layer() {
        let layer = NeuraDenseLayer::from_rng(&mut rand::thread_rng(), Relu, NeuraL0)
            as NeuraDenseLayer<Relu, NeuraL0, 1000, 1000>;

        layer.backpropagate(&uniform_vector(), uniform_vector());

        <NeuraDenseLayer<Relu, NeuraL0, 1000, 1000> as NeuraTrainableLayer>::Delta::zero();
    }
}
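For readers skimming the diff, here is a usage sketch of the dense layer's API as defined above. It is a non-authoritative example: the in-crate paths mirror the test module, the 4/2 sizes are arbitrary, and a real training step would scale and negate the gradient before applying it.

```rust
use crate::algebra::NeuraVector;
use crate::derivable::{activation::Relu, regularize::NeuraL0};
use crate::layer::{NeuraDenseLayer, NeuraLayer, NeuraTrainableLayer};

fn dense_layer_sketch() {
    // 4 -> 2 dense layer; weights drawn from the activation-aware normal distribution.
    let mut layer: NeuraDenseLayer<Relu, NeuraL0, 4, 2> =
        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), Relu, NeuraL0);

    // Forward pass: result = activation(weights * input + bias).
    let input: NeuraVector<4, f64> = NeuraVector::from_value(0.5);
    let output = layer.eval(&input);

    // Backward pass: pass the output gradient (`epsilon`) and receive the
    // gradient to forward to the previous layer plus the (weights, bias) delta.
    let epsilon: NeuraVector<2, f64> = output; // placeholder for dLoss/dOutput
    let (_previous_epsilon, delta) = layer.backpropagate(&input, epsilon);

    // `apply_gradient` simply adds the delta to the weights and bias,
    // so the caller is expected to pre-scale it (e.g. by -learning_rate).
    layer.apply_gradient(&delta);
}
```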
@@ -0,0 +1,170 @@
mod dense;
pub use dense::NeuraDenseLayer;

mod convolution;
pub use convolution::{NeuraConv1DPadLayer, NeuraConv2DBlockLayer, NeuraConv2DPadLayer};

mod dropout;
pub use dropout::NeuraDropoutLayer;

mod softmax;
pub use softmax::NeuraSoftmaxLayer;

mod one_hot;
pub use one_hot::NeuraOneHotLayer;

mod lock;
pub use lock::NeuraLockLayer;

mod pool;
pub use pool::{NeuraGlobalPoolLayer, NeuraPool1DLayer};

mod reshape;
pub use reshape::{NeuraFlattenLayer, NeuraReshapeLayer};

use crate::algebra::NeuraVectorSpace;

pub trait NeuraLayer {
    type Input;
    type Output;

    fn eval(&self, input: &Self::Input) -> Self::Output;
}

pub trait NeuraTrainableLayer: NeuraLayer {
    /// The representation of the layer gradient as a vector space
    type Delta: NeuraVectorSpace;

    /// Computes the backpropagation term and the derivative of the internal weights,
    /// using the `input` vector output by the previous layer and the backpropagation term `epsilon` of the next layer.
    ///
    /// Note: we introduce the term `epsilon`, which, together with the activation function of the current layer, can be used to compute `delta_l`:
    /// ```no_rust
    /// f_l'(a_l) * epsilon_l = delta_l
    /// ```
    ///
    /// The function should then return a pair `(epsilon_{l-1}, δW_l)`,
    /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`.
    /// Using this intermediate value for `delta` allows us to isolate its computation to the respective layers.
    fn backpropagate(
        &self,
        input: &Self::Input,
        epsilon: Self::Output,
    ) -> (Self::Input, Self::Delta);

    /// Computes the regularization
    fn regularize(&self) -> Self::Delta;

    /// Applies `δW_l` to the weights of the layer
    fn apply_gradient(&mut self, gradient: &Self::Delta);

    /// Called before an iteration begins, to allow the layer to set itself up for training.
    #[inline(always)]
    fn prepare_epoch(&mut self) {}

    /// Called at the end of training, to allow the layer to clean itself up.
    #[inline(always)]
    fn cleanup(&mut self) {}
}

#[macro_export]
macro_rules! neura_layer {
    ( "dense", $( $shape:expr ),*; $activation:expr ) => {
        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
            as neura_layer!("_dense_shape", $($shape),*)
    };

    ( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
            as neura_layer!("_dense_shape", $($shape),*)
    };

    ( "_dense_shape", $output:expr ) => {
        $crate::layer::NeuraDenseLayer<_, _, _, $output>
    };

    ( "_dense_shape", $input:expr, $output:expr ) => {
        $crate::layer::NeuraDenseLayer<_, _, $input, $output>
    };

    ( "dropout", $probability:expr ) => {
        $crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
            as $crate::layer::NeuraDropoutLayer<_, _>
    };

    ( "softmax" ) => {
        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
    };

    ( "softmax", $length:expr ) => {
        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
    };

    ( "one_hot" ) => {
        $crate::layer::NeuraOneHotLayer as $crate::layer::NeuraOneHotLayer<2, _>
    };

    ( "lock", $layer:expr ) => {
        $crate::layer::NeuraLockLayer($layer)
    };

    ( "conv1d_pad", $length:expr, $feats:expr; $window:expr; $layer:expr ) => {
        $crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<$length, $feats, $window, _>
    };

    ( "conv1d_pad"; $window:expr; $layer:expr ) => {
        $crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<_, _, $window, _>
    };

    ( "conv2d_pad", $feats:expr, $length:expr; $width:expr, $window:expr; $layer:expr ) => {
        $crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<$length, $feats, $window, _>
    };

    ( "conv2d_pad"; $width:expr, $window:expr; $layer:expr ) => {
        $crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<_, _, $window, _>
    };

    ( "conv2d_block", $feats:expr, $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
        $crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, $feats, $block_size, _>
    };

    ( "conv2d_block", $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
        $crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, _, $block_size, _>
    };

    ( "pool_global"; $reduce:expr ) => {
        $crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<_, _, _>
    };

    ( "pool_global", $feats:expr, $length:expr; $reduce:expr ) => {
        $crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<$length, $feats, _>
    };

    ( "pool1d", $blocklength:expr; $reduce:expr ) => {
        $crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<_, $blocklength, _, _>
    };

    ( "pool1d", $blocks:expr, $blocklength:expr; $reduce:expr ) => {
        $crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, _, _>
    };

    ( "pool1d", $feats:expr, $blocks:expr, $blocklength:expr; $reduce:expr ) => {
        $crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, $feats, _>
    };

    ( "unstable_flatten" ) => {
        $crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<_, _, f64>
    };

    ( "unstable_flatten", $width:expr, $height:expr ) => {
        $crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<$width, $height, f64>
    };

    ( "unstable_reshape", $height:expr ) => {
        $crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<_, $height, f64>
    };

    ( "unstable_reshape", $width:expr, $height:expr ) => {
        $crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<$width, $height, f64>
    };
}
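Based solely on the macro arms above, here is a sketch of how `neura_layer!` is meant to be invoked. The surrounding network-builder these layers plug into is not part of this diff, so the snippet only shows individual layer construction, and the sizes and probability are arbitrary.

```rust
use crate::derivable::activation::Relu;
// `neura_layer!` is #[macro_export]ed, so it is reachable from the crate root.
use crate::neura_layer;

fn layer_macro_sketch() {
    // Dense layer with explicit input and output sizes, ReLU activation,
    // and the default NeuraL0 regularization:
    let dense = neura_layer!("dense", 100, 50; Relu);

    // Dense layer with only the output size; the input size is left as `_`
    // so it can be inferred from the surrounding network:
    let hidden = neura_layer!("dense", 50; Relu);

    // Dropout with a 50% drop probability, and a softmax over an inferred length:
    let dropout = neura_layer!("dropout", 0.5);
    let softmax = neura_layer!("softmax");
}
```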