🔥 🚚 ♻️ Refactoring the previous layer system

It was becoming almost impossible to manage the dimensions of the layers,
especially with convolution layers. Const generics are nice, but they are still
too immature for this use-case. We'll probably expand the implementations to accept
either const-sized or dynamically-sized layers at some point, for performance-critical applications.
main
Shad Amethyst 2 years ago
parent cc7686569a
commit 2edbff860c
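The gist of the refactor: a layer no longer carries its input size as a const generic parameter. A layer builder only declares its output size, and the input shape is supplied once for the whole network when it is constructed. Condensed from the updated XOR example below, a minimal sketch of the new flow (not the full example):

use nalgebra::dvector;
use neuramethyst::derivable::activation::Relu;
use neuramethyst::prelude::*;

fn main() {
    // Layers only declare their output size now; each input size is inferred
    // when the network as a whole is given its input shape.
    let network = neura_sequential![
        neura_layer!("dense", 4, Relu),
        neura_layer!("dense", 3, Relu),
        neura_layer!("dense", 1, Relu)
    ]
    // `construct` propagates NeuraShape::Vector(2) through the layers and
    // fails if the shapes cannot be reconciled.
    .construct(NeuraShape::Vector(2))
    .unwrap();

    println!("{}", network.eval(&dvector![0.0, 1.0]));
}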

@@ -7,6 +7,7 @@ edition = "2021"
 [dependencies]
 boxed-array = "0.1.0"
+nalgebra = { version = "^0.32", features = ["std", "macros", "rand"] }
 ndarray = "^0.15"
 num = "^0.4"
 # num-traits = "0.2.15"

@@ -1,22 +1,24 @@
 #![feature(generic_arg_infer)]

-use neuramethyst::algebra::NeuraVector;
+use nalgebra::dvector;
 use neuramethyst::derivable::activation::Relu;
 use neuramethyst::derivable::loss::Euclidean;
-use neuramethyst::{cycle_shuffling, prelude::*};
+use neuramethyst::prelude::*;
+use neuramethyst::cycle_shuffling;

 fn main() {
     let mut network = neura_sequential![
-        neura_layer!("dense", 2, 4; Relu),
-        neura_layer!("dense", 3; Relu),
-        neura_layer!("dense", 1; Relu)
-    ];
+        neura_layer!("dense", 4, Relu),
+        neura_layer!("dense", 3, Relu),
+        neura_layer!("dense", 1, Relu)
+    ].construct(NeuraShape::Vector(2)).unwrap();

-    let inputs: [(NeuraVector<2, f64>, NeuraVector<1, f64>); 4] = [
-        ([0.0, 0.0].into(), [0.0].into()),
-        ([0.0, 1.0].into(), [1.0].into()),
-        ([1.0, 0.0].into(), [1.0].into()),
-        ([1.0, 1.0].into(), [0.0].into()),
+    let inputs = [
+        (dvector![0.0, 0.0], dvector![0.0]),
+        (dvector![0.0, 1.0], dvector![1.0]),
+        (dvector![1.0, 0.0], dvector![1.0]),
+        (dvector![1.0, 1.0], dvector![0.0]),
     ];

     for (input, target) in &inputs {

@@ -167,10 +167,10 @@ impl<const WIDTH: usize, const HEIGHT: usize, F: NeuraVectorSpace + Clone> Neura
         }
     }

-    #[inline(always)]
-    fn zero() -> Self {
-        Self::from_value(F::zero())
-    }
+    // #[inline(always)]
+    // fn zero() -> Self {
+    //     Self::from_value(F::zero())
+    // }

     fn norm_squared(&self) -> f64 {
         let mut sum = 0.0;

@@ -2,6 +2,8 @@ mod matrix;
 pub use matrix::NeuraMatrix;

 mod vector;
+use nalgebra::Matrix;
+use num::Float;
 pub use vector::NeuraVector;

 /// An extension of `std::ops::AddAssign` and `std::ops::Default`
@@ -10,7 +12,7 @@ pub trait NeuraVectorSpace {
     fn mul_assign(&mut self, by: f64);

-    fn zero() -> Self;
+    // fn zero() -> Self;

     fn norm_squared(&self) -> f64;
 }
@@ -26,10 +28,10 @@ impl NeuraVectorSpace for () {
         // Noop
     }

-    #[inline(always)]
-    fn zero() -> Self {
-        ()
-    }
+    // #[inline(always)]
+    // fn zero() -> Self {
+    //     ()
+    // }

     fn norm_squared(&self) -> f64 {
         0.0
@@ -45,9 +47,9 @@ impl<T: NeuraVectorSpace> NeuraVectorSpace for Box<T> {
         self.as_mut().mul_assign(by);
     }

-    fn zero() -> Self {
-        Box::new(T::zero())
-    }
+    // fn zero() -> Self {
+    //     Box::new(T::zero())
+    // }

     fn norm_squared(&self) -> f64 {
         self.as_ref().norm_squared()
@@ -65,9 +67,9 @@ impl<Left: NeuraVectorSpace, Right: NeuraVectorSpace> NeuraVectorSpace for (Left
         NeuraVectorSpace::mul_assign(&mut self.1, by);
     }

-    fn zero() -> Self {
-        (Left::zero(), Right::zero())
-    }
+    // fn zero() -> Self {
+    //     (Left::zero(), Right::zero())
+    // }

     fn norm_squared(&self) -> f64 {
         self.0.norm_squared() + self.1.norm_squared()
@@ -87,24 +89,43 @@ impl<const N: usize, T: NeuraVectorSpace + Clone> NeuraVectorSpace for [T; N] {
         }
     }

-    fn zero() -> Self {
-        let mut res: Vec<T> = Vec::with_capacity(N);
-        for _ in 0..N {
-            res.push(T::zero());
-        }
-        res.try_into().unwrap_or_else(|_| {
-            // TODO: check that this panic is optimized away
-            unreachable!()
-        })
-    }
+    // fn zero() -> Self {
+    //     let mut res: Vec<T> = Vec::with_capacity(N);
+    //     for _ in 0..N {
+    //         res.push(T::zero());
+    //     }
+    //     res.try_into().unwrap_or_else(|_| {
+    //         // TODO: check that this panic is optimized away
+    //         unreachable!()
+    //     })
+    // }

     fn norm_squared(&self) -> f64 {
         self.iter().map(T::norm_squared).sum()
     }
 }

+impl<F: Float, R: nalgebra::Dim, C: nalgebra::Dim, S: nalgebra::RawStorage<F, R, C>> NeuraVectorSpace for Matrix<F, R, C, S>
+where
+    Matrix<F, R, C, S>: std::ops::MulAssign<F>,
+    for<'c> Matrix<F, R, C, S>: std::ops::AddAssign<&'c Matrix<F, R, C, S>>,
+    F: From<f64> + Into<f64>
+{
+    fn add_assign(&mut self, other: &Self) {
+        *self += other;
+    }
+
+    fn mul_assign(&mut self, by: f64) {
+        *self *= <F as From<f64>>::from(by);
+    }
+
+    fn norm_squared(&self) -> f64 {
+        self.iter().map(|x| *x * *x).reduce(|sum, curr| sum + curr).unwrap_or(F::zero()).into()
+    }
+}
+
 macro_rules! base {
     ( $type:ty ) => {
         impl NeuraVectorSpace for $type {
@@ -116,9 +137,9 @@ macro_rules! base {
                 std::ops::MulAssign::mul_assign(self, other as $type);
             }

-            fn zero() -> Self {
-                <Self as Default>::default()
-            }
+            // fn zero() -> Self {
+            //     <Self as Default>::default()
+            // }

             fn norm_squared(&self) -> f64 {
                 (self * self) as f64
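Since a dynamically-sized gradient (a DVector or DMatrix) cannot be created without knowing its dimensions, NeuraVectorSpace::zero() is gone; layers and networks now expose default_gradient(&self) instead, and the vector-space trait shrinks to add_assign, mul_assign and norm_squared. As a sketch, this is all a custom gradient type has to provide (MyGradient is made up for illustration):

use neuramethyst::algebra::NeuraVectorSpace;

// Hypothetical gradient for a custom two-parameter layer.
struct MyGradient {
    scale: f64,
    offset: f64,
}

impl NeuraVectorSpace for MyGradient {
    fn add_assign(&mut self, other: &Self) {
        self.scale += other.scale;
        self.offset += other.offset;
    }

    fn mul_assign(&mut self, by: f64) {
        self.scale *= by;
        self.offset *= by;
    }

    fn norm_squared(&self) -> f64 {
        self.scale * self.scale + self.offset * self.offset
    }
}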

@@ -95,10 +95,10 @@ impl<const LENGTH: usize, F: Float + From<f64> + Into<f64>> NeuraVectorSpace
         }
     }

-    #[inline(always)]
-    fn zero() -> Self {
-        Self::from_value(F::zero())
-    }
+    // #[inline(always)]
+    // fn zero() -> Self {
+    //     Self::from_value(F::zero())
+    // }

     fn norm_squared(&self) -> f64 {
         let mut sum = F::zero();

@@ -1,19 +1,22 @@
+use nalgebra::DVector;
 use crate::algebra::NeuraVector;

 use super::NeuraLoss;

 #[derive(Clone, Copy, Debug, PartialEq)]
-pub struct Euclidean<const N: usize>;
+pub struct Euclidean;

-impl<const N: usize> NeuraLoss for Euclidean<N> {
-    type Input = NeuraVector<N, f64>;
-    type Target = NeuraVector<N, f64>;
+impl NeuraLoss for Euclidean {
+    type Input = DVector<f64>;
+    type Target = DVector<f64>;

     #[inline]
-    fn eval(&self, target: &NeuraVector<N, f64>, actual: &NeuraVector<N, f64>) -> f64 {
+    fn eval(&self, target: &DVector<f64>, actual: &DVector<f64>) -> f64 {
+        assert_eq!(target.shape(), actual.shape());
         let mut sum_squared = 0.0;

-        for i in 0..N {
+        for i in 0..target.len() {
             sum_squared += (target[i] - actual[i]) * (target[i] - actual[i]);
         }
@@ -23,13 +26,13 @@ impl<const N: usize> NeuraLoss for Euclidean<N> {
     #[inline]
     fn nabla(
         &self,
-        target: &NeuraVector<N, f64>,
-        actual: &NeuraVector<N, f64>,
-    ) -> NeuraVector<N, f64> {
-        let mut res = NeuraVector::default();
+        target: &DVector<f64>,
+        actual: &DVector<f64>,
+    ) -> DVector<f64> {
+        let mut res = DVector::zeros(target.len());

         // ∂E(y)/∂yᵢ = yᵢ - yᵢ'
-        for i in 0..N {
+        for i in 0..target.len() {
             res[i] = actual[i] - target[i];
         }
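For reference, the reworked loss is used with plain DVectors roughly as follows (a sketch: the exact scaling applied by eval is not visible in this hunk, and the import assumes NeuraLoss is public in neuramethyst::derivable, as the `use super::NeuraLoss` above suggests):

use nalgebra::dvector;
use neuramethyst::derivable::loss::Euclidean;
use neuramethyst::derivable::NeuraLoss;

fn main() {
    let target = dvector![1.0, 0.0];
    let actual = dvector![0.8, 0.1];

    // `eval` accumulates the squared per-component differences...
    let loss = Euclidean.eval(&target, &actual);
    // ...and `nabla` returns the per-component gradient `actual - target`.
    let gradient = Euclidean.nabla(&target, &actual);

    println!("loss = {loss}, gradient = {gradient}");
}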

@ -1,38 +1,49 @@
use super::{NeuraLayer, NeuraTrainableLayer}; use std::marker::PhantomData;
use crate::{
algebra::{NeuraMatrix, NeuraVector, NeuraVectorSpace},
derivable::NeuraDerivable,
};
use nalgebra::{DMatrix, DVector};
use num::Float;
use rand::Rng; use rand::Rng;
use rand_distr::Distribution;
use crate::derivable::NeuraDerivable;
use super::*;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct NeuraDenseLayer< pub struct NeuraDenseLayer<F: Float, Act: NeuraDerivable<F>, Reg: NeuraDerivable<F>> {
Act: NeuraDerivable<f64>, weights: DMatrix<F>,
Reg: NeuraDerivable<f64>, bias: DVector<F>,
const INPUT_LEN: usize, activation: Act,
const OUTPUT_LEN: usize, regularization: Reg,
}
#[derive(Clone, Debug)]
pub struct NeuraDenseLayerPartial<
F: Float,
Act: NeuraDerivable<F>,
Reg: NeuraDerivable<F>,
R: Rng,
> { > {
weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
bias: NeuraVector<OUTPUT_LEN, f64>,
activation: Act, activation: Act,
regularization: Reg, regularization: Reg,
output_size: usize,
rng: R,
phantom: PhantomData<F>,
} }
impl< impl<
Act: NeuraDerivable<f64>, F: Float + From<f64> + std::fmt::Debug + 'static,
Reg: NeuraDerivable<f64>, Act: NeuraDerivable<F>,
const INPUT_LEN: usize, Reg: NeuraDerivable<F>,
const OUTPUT_LEN: usize, > NeuraDenseLayer<F, Act, Reg>
> NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{ {
pub fn new( pub fn new(
weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>, weights: DMatrix<F>,
bias: NeuraVector<OUTPUT_LEN, f64>, bias: DVector<F>,
activation: Act, activation: Act,
regularization: Reg, regularization: Reg,
) -> Self { ) -> Self {
assert_eq!(bias.shape().0, weights.shape().0);
Self { Self {
weights, weights,
bias, bias,
@ -41,85 +52,129 @@ impl<
} }
} }
pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self { pub fn from_rng(
let mut weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64> = NeuraMatrix::from_value(0.0f64); input_size: usize,
output_size: usize,
// Use Xavier (or He) initialisation, using the harmonic mean rng: &mut impl Rng,
// Ref: https://www.deeplearning.ai/ai-notes/initialization/index.html activation: Act,
regularization: Reg,
) -> Self
where
rand_distr::StandardNormal: rand_distr::Distribution<F>,
{
let distribution = rand_distr::Normal::new( let distribution = rand_distr::Normal::new(
0.0, F::zero(),
activation.variance_hint() * 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64), <F as From<f64>>::from(
activation.variance_hint() * 2.0 / (input_size as f64 + output_size as f64),
),
) )
.unwrap(); .unwrap();
// let distribution = rand_distr::Uniform::new(-0.5, 0.5);
for i in 0..OUTPUT_LEN { Self {
for j in 0..INPUT_LEN { weights: DMatrix::from_distribution(output_size, input_size, &distribution, rng),
weights[i][j] = distribution.sample(rng); bias: DVector::from_element(
} output_size,
<F as From<f64>>::from(activation.bias_hint()),
),
activation,
regularization,
} }
}
Self { pub fn new_partial<R: Rng>(
weights, output_size: usize,
// Biases are initialized based on the activation's hint rng: R,
bias: NeuraVector::from_value(activation.bias_hint()), activation: Act,
regularization: Reg,
) -> NeuraDenseLayerPartial<F, Act, Reg, R> {
NeuraDenseLayerPartial {
activation, activation,
regularization, regularization,
output_size,
rng,
phantom: PhantomData,
} }
} }
} }
impl< impl<
Act: NeuraDerivable<f64>, F: Float + From<f64> + std::fmt::Debug + 'static,
Reg: NeuraDerivable<f64>, Act: NeuraDerivable<F>,
const INPUT_LEN: usize, Reg: NeuraDerivable<F>,
const OUTPUT_LEN: usize, R: Rng,
> NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN> > NeuraPartialLayer for NeuraDenseLayerPartial<F, Act, Reg, R>
where
rand_distr::StandardNormal: rand_distr::Distribution<F>,
{ {
type Input = NeuraVector<INPUT_LEN, f64>; type Constructed = NeuraDenseLayer<F, Act, Reg>;
type Err = ();
fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
let mut rng = self.rng;
Ok(NeuraDenseLayer::from_rng(
input_shape.size(),
self.output_size,
&mut rng,
self.activation,
self.regularization,
))
}
fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
NeuraShape::Vector(constructed.weights.shape().0)
}
}
type Output = NeuraVector<OUTPUT_LEN, f64>; impl<
F: Float + From<f64> + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
Act: NeuraDerivable<F>,
Reg: NeuraDerivable<F>,
> NeuraLayer<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
{
type Output = DVector<F>;
fn eval(&self, input: &Self::Input) -> Self::Output { fn eval(&self, input: &DVector<F>) -> Self::Output {
let mut result = self.weights.multiply_vector(input); assert_eq!(input.shape().0, self.weights.shape().1);
for i in 0..OUTPUT_LEN { let res = &self.weights * input + &self.bias;
result[i] = self.activation.eval(result[i] + self.bias[i]);
}
result res.map(|x| self.activation.eval(x))
} }
} }
impl< impl<
Act: NeuraDerivable<f64>, F: Float + From<f64> + Into<f64> + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
Reg: NeuraDerivable<f64>, Act: NeuraDerivable<F>,
const INPUT_LEN: usize, Reg: NeuraDerivable<F>,
const OUTPUT_LEN: usize, > NeuraTrainableLayer<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
> NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{ {
type Delta = ( type Gradient = (DMatrix<F>, DVector<F>);
NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
NeuraVector<OUTPUT_LEN, f64>,
);
fn backpropagate( fn default_gradient(&self) -> Self::Gradient {
(
DMatrix::zeros(self.weights.shape().0, self.weights.shape().1),
DVector::zeros(self.bias.shape().0),
)
}
fn backprop_layer(
&self, &self,
input: &Self::Input, input: &DVector<F>,
epsilon: Self::Output, epsilon: Self::Output,
) -> (Self::Input, Self::Delta) { ) -> (DVector<F>, Self::Gradient) {
let evaluated = self.weights.multiply_vector(input); let evaluated = &self.weights * input;
// Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
// with `self.activation'(input) ° epsilon = delta` // with `self.activation'(input) ° epsilon = delta`
let mut delta: NeuraVector<OUTPUT_LEN, f64> = epsilon.clone(); let mut delta = epsilon.clone();
for i in 0..OUTPUT_LEN {
for i in 0..delta.len() {
delta[i] *= self.activation.derivate(evaluated[i]); delta[i] *= self.activation.derivate(evaluated[i]);
} }
// Compute the weight gradient // Compute the weight gradient
let weights_gradient = delta.reverse_dot(input); let weights_gradient = &delta * input.transpose();
let new_epsilon = self.weights.transpose_multiply_vector(&delta); let new_epsilon = self.weights.tr_mul(&delta);
// According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation
// The gradient of the bias is equal to the delta term of the backpropagation algorithm // The gradient of the bias is equal to the delta term of the backpropagation algorithm
@ -128,53 +183,12 @@ impl<
(new_epsilon, (weights_gradient, bias_gradient)) (new_epsilon, (weights_gradient, bias_gradient))
} }
fn apply_gradient(&mut self, gradient: &Self::Delta) { fn regularize_layer(&self) -> Self::Gradient {
NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0); (self.weights.map(|x| self.regularization.derivate(x)), DVector::zeros(self.bias.shape().0))
NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
} }
fn regularize(&self) -> Self::Delta { fn apply_gradient(&mut self, gradient: &Self::Gradient) {
let mut res = Self::Delta::default(); self.weights += &gradient.0;
self.bias += &gradient.1;
for i in 0..OUTPUT_LEN {
for j in 0..INPUT_LEN {
res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
}
}
// Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
res
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::{
derivable::{activation::Relu, regularize::NeuraL0},
utils::uniform_vector,
};
#[test]
fn test_from_rng() {
let mut rng = rand::thread_rng();
let layer: NeuraDenseLayer<_, _, 64, 32> =
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
let mut input = [0.0; 64];
for x in 0..64 {
input[x] = rng.gen();
}
assert!(layer.eval(&input.into()).len() == 32);
}
#[test]
fn test_stack_overflow_big_layer() {
let layer = NeuraDenseLayer::from_rng(&mut rand::thread_rng(), Relu, NeuraL0)
as NeuraDenseLayer<Relu, NeuraL0, 1000, 1000>;
layer.backpropagate(&uniform_vector(), uniform_vector());
<NeuraDenseLayer<Relu, NeuraL0, 1000, 1000> as NeuraTrainableLayer>::Delta::zero();
} }
} }
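The dense layer now comes in two parts: a fully-constructed NeuraDenseLayer, for when both sizes are known up front, and a NeuraDenseLayerPartial that only knows its output size until construct is called with the input shape. Roughly (a sketch, not copied from the tests):

use nalgebra::dvector;
use neuramethyst::derivable::activation::Relu;
use neuramethyst::derivable::regularize::NeuraL0;
use neuramethyst::layer::{NeuraDenseLayer, NeuraLayer, NeuraPartialLayer, NeuraShape};

fn main() {
    let mut rng = rand::thread_rng();

    // Sizes are plain runtime values instead of const generics.
    let layer: NeuraDenseLayer<f64, _, _> =
        NeuraDenseLayer::from_rng(2, 3, &mut rng, Relu, NeuraL0);
    assert_eq!(layer.eval(&dvector![0.5, -0.5]).len(), 3);

    // Or defer the input size entirely and let `construct` fill it in later:
    let partial = NeuraDenseLayer::<f64, _, _>::new_partial(3, rand::thread_rng(), Relu, NeuraL0);
    let layer = partial.construct(NeuraShape::Vector(2)).unwrap();
    assert_eq!(layer.eval(&dvector![0.5, -0.5]).len(), 3);
}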

@ -1,39 +1,55 @@
mod dense; use num::Float;
pub use dense::NeuraDenseLayer;
mod convolution; use crate::algebra::NeuraVectorSpace;
pub use convolution::{NeuraConv1DPadLayer, NeuraConv2DBlockLayer, NeuraConv2DPadLayer};
mod dropout; pub mod dense;
pub use dropout::NeuraDropoutLayer; pub use dense::NeuraDenseLayer;
mod softmax; #[derive(Clone, Copy, PartialEq, Debug)]
pub use softmax::NeuraSoftmaxLayer; pub enum NeuraShape {
Vector(usize), // entries
Matrix(usize, usize), // rows, columns
Tensor(usize, usize, usize), // rows, columns, channels
}
mod one_hot; impl NeuraShape {
pub use one_hot::NeuraOneHotLayer; pub fn size(&self) -> usize {
match self {
NeuraShape::Vector(entries) => *entries,
NeuraShape::Matrix(rows, columns) => rows * columns,
NeuraShape::Tensor(rows, columns, channels) => rows * columns * channels
}
}
}
pub trait NeuraLayer<Input> {
type Output;
mod lock; fn eval(&self, input: &Input) -> Self::Output;
pub use lock::NeuraLockLayer; }
mod pool; impl<Input: Clone> NeuraLayer<Input> for () {
pub use pool::{NeuraGlobalPoolLayer, NeuraPool1DLayer}; type Output = Input;
mod reshape; fn eval(&self, input: &Input) -> Self::Output {
pub use reshape::{NeuraFlattenLayer, NeuraReshapeLayer}; input.clone()
}
}
use crate::algebra::NeuraVectorSpace; pub trait NeuraPartialLayer {
type Constructed;
type Err;
pub trait NeuraLayer { fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err>;
type Input;
type Output;
fn eval(&self, input: &Self::Input) -> Self::Output; fn output_shape(constructed: &Self::Constructed) -> NeuraShape;
} }
pub trait NeuraTrainableLayer: NeuraLayer { pub trait NeuraTrainableLayer<Input>: NeuraLayer<Input> {
/// The representation of the layer gradient as a vector space /// The representation of the layer gradient as a vector space
type Delta: NeuraVectorSpace; type Gradient: NeuraVectorSpace;
fn default_gradient(&self) -> Self::Gradient;
/// Computes the backpropagation term and the derivative of the internal weights, /// Computes the backpropagation term and the derivative of the internal weights,
/// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer. /// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer.
@ -46,125 +62,28 @@ pub trait NeuraTrainableLayer: NeuraLayer {
/// The function should then return a pair `(epsilon_{l-1}, δW_l)`, /// The function should then return a pair `(epsilon_{l-1}, δW_l)`,
/// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`. /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`.
/// Using this intermediate value for `delta` allows us to isolate it computation to the respective layers. /// Using this intermediate value for `delta` allows us to isolate it computation to the respective layers.
fn backpropagate( fn backprop_layer(
&self, &self,
input: &Self::Input, input: &Input,
epsilon: Self::Output, epsilon: Self::Output,
) -> (Self::Input, Self::Delta); ) -> (Input, Self::Gradient);
/// Computes the regularization /// Computes the regularization
fn regularize(&self) -> Self::Delta; fn regularize_layer(&self) -> Self::Gradient;
/// Applies `δW_l` to the weights of the layer /// Applies `δW_l` to the weights of the layer
fn apply_gradient(&mut self, gradient: &Self::Delta); fn apply_gradient(&mut self, gradient: &Self::Gradient);
/// Called before an iteration begins, to allow the layer to set itself up for training.
#[inline(always)]
fn prepare_epoch(&mut self) {}
/// Called at the end of training, to allow the layer to clean itself up /// Arbitrary computation that can be executed at the start of an epoch
#[allow(unused_variables)]
#[inline(always)] #[inline(always)]
fn cleanup(&mut self) {} fn prepare_layer(&mut self, is_training: bool) {}
} }
/// Temporary implementation of neura_layer
#[macro_export] #[macro_export]
macro_rules! neura_layer { macro_rules! neura_layer {
( "dense", $( $shape:expr ),*; $activation:expr ) => { ( "dense", $output:expr, $activation:expr ) => {
$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0) $crate::layer::dense::NeuraDenseLayer::new_partial($output, rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
as neura_layer!("_dense_shape", $($shape),*) }
};
( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
as neura_layer!("_dense_shape", $($shape),*)
};
( "_dense_shape", $output:expr ) => {
$crate::layer::NeuraDenseLayer<_, _, _, $output>
};
( "_dense_shape", $input:expr, $output:expr ) => {
$crate::layer::NeuraDenseLayer<_, _, $input, $output>
};
( "dropout", $probability:expr ) => {
$crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
as $crate::layer::NeuraDropoutLayer<_, _>
};
( "softmax" ) => {
$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
};
( "softmax", $length:expr ) => {
$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
};
( "one_hot" ) => {
$crate::layer::NeuraOneHotLayer as $crate::layer::NeuraOneHotLayer<2, _>
};
( "lock", $layer:expr ) => {
$crate::layer::NeuraLockLayer($layer)
};
( "conv1d_pad", $length:expr, $feats:expr; $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<$length, $feats, $window, _>
};
( "conv1d_pad"; $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<_, _, $window, _>
};
( "conv2d_pad", $feats:expr, $length:expr; $width:expr, $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<$length, $feats, $window, _>
};
( "conv2d_pad"; $width:expr, $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<_, _, $window, _>
};
( "conv2d_block", $feats:expr, $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, $feats, $block_size, _>
};
( "conv2d_block", $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, _, $block_size, _>
};
( "pool_global"; $reduce:expr ) => {
$crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<_, _, _>
};
( "pool_global", $feats:expr, $length:expr; $reduce:expr ) => {
$crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<$length, $feats, _>
};
( "pool1d", $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<_, $blocklength, _, _>
};
( "pool1d", $blocks:expr, $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, _, _>
};
( "pool1d", $feats:expr, $blocks:expr, $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, $feats, _>
};
( "unstable_flatten" ) => {
$crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<_, _, f64>
};
( "unstable_flatten", $width:expr, $height:expr ) => {
$crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<$width, $height, f64>
};
( "unstable_reshape", $height:expr ) => {
$crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<_, $height, f64>
};
( "unstable_reshape", $width:expr, $height:expr ) => {
$crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<$width, $height, f64>
};
} }
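With NeuraLayer parameterized by its input type and NeuraPartialLayer handling shape propagation, a custom layer splits into those two halves. A made-up element-wise layer as an illustration; a trainable layer would additionally implement NeuraTrainableLayer<Input> with its Gradient type, default_gradient, backprop_layer, regularize_layer and apply_gradient:

use nalgebra::DVector;
use neuramethyst::layer::{NeuraLayer, NeuraPartialLayer, NeuraShape};

// Hypothetical layer: multiplies every entry of its input by a constant.
#[derive(Clone, Debug)]
struct ScaleLayer {
    factor: f64,
    shape: NeuraShape,
}

// The builder only knows the factor; the shape arrives at construction time.
#[derive(Clone, Debug)]
struct ScaleLayerPartial {
    factor: f64,
}

impl NeuraLayer<DVector<f64>> for ScaleLayer {
    type Output = DVector<f64>;

    fn eval(&self, input: &DVector<f64>) -> Self::Output {
        input * self.factor
    }
}

impl NeuraPartialLayer for ScaleLayerPartial {
    type Constructed = ScaleLayer;
    type Err = ();

    fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
        Ok(ScaleLayer { factor: self.factor, shape: input_shape })
    }

    fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
        // An element-wise layer keeps the shape of its input.
        constructed.shape
    }
}

fn main() {
    let layer = ScaleLayerPartial { factor: 2.0 }
        .construct(NeuraShape::Vector(3))
        .unwrap();
    assert_eq!(layer.eval(&DVector::from_element(3, 1.0)), DVector::from_element(3, 2.0));
}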

@@ -1,12 +1,15 @@
 #![feature(generic_arg_infer)]
 #![feature(generic_const_exprs)]
+#![feature(negative_impls)]

 pub mod algebra;
 pub mod derivable;
-pub mod layer;
+// pub mod layer;
 pub mod network;
 pub mod train;
+pub mod layer;

 mod utils;

 // TODO: move to a different file
@@ -17,7 +20,7 @@ pub mod prelude {
     pub use crate::{neura_layer, neura_sequential};

     // Structs and traits
-    pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer};
-    pub use crate::network::sequential::{NeuraSequential, NeuraSequentialTail};
+    pub use crate::layer::*;
+    pub use crate::network::sequential::{NeuraSequential, NeuraSequentialTail, NeuraSequentialBuild};
     pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
 }

@@ -2,25 +2,24 @@ use crate::{algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer};
 pub mod sequential;

-pub trait NeuraTrainableNetwork: NeuraLayer {
+pub trait NeuraTrainableNetwork<Input>: NeuraLayer<Input> {
     type Delta: NeuraVectorSpace;

+    fn default_gradient(&self) -> Self::Delta;
+
     fn apply_gradient(&mut self, gradient: &Self::Delta);

     /// Should implement the backpropagation algorithm, see `NeuraTrainableLayer::backpropagate` for more information.
     fn backpropagate<Loss: NeuraLoss<Input = Self::Output>>(
         &self,
-        input: &Self::Input,
+        input: &Input,
         target: &Loss::Target,
         loss: Loss,
-    ) -> (Self::Input, Self::Delta);
+    ) -> (Input, Self::Delta);

     /// Should return the regularization gradient
     fn regularize(&self) -> Self::Delta;

-    /// Called before an iteration begins, to allow the network to set itself up for training.
-    fn prepare_epoch(&mut self);
-
-    /// Called at the end of training, to allow the network to clean itself up
-    fn cleanup(&mut self);
+    /// Called before an iteration begins, to allow the network to set itself up for training or not.
+    fn prepare(&mut self, train_iteration: bool);
 }

@ -1,12 +1,14 @@
use num::Float;
use crate::{ use crate::{
derivable::NeuraLoss, derivable::NeuraLoss,
layer::{NeuraLayer, NeuraTrainableLayer}, layer::{NeuraLayer, NeuraTrainableLayer, NeuraShape, NeuraPartialLayer},
}; };
use super::NeuraTrainableNetwork; use super::NeuraTrainableNetwork;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct NeuraSequential<Layer: NeuraLayer, ChildNetwork> { pub struct NeuraSequential<Layer, ChildNetwork> {
pub layer: Layer, pub layer: Layer,
pub child_network: Box<ChildNetwork>, pub child_network: Box<ChildNetwork>,
} }
@ -14,13 +16,13 @@ pub struct NeuraSequential<Layer: NeuraLayer, ChildNetwork> {
/// Operations on the tail end of a sequential network /// Operations on the tail end of a sequential network
pub trait NeuraSequentialTail { pub trait NeuraSequentialTail {
type TailTrimmed; type TailTrimmed;
type TailPushed<T: NeuraLayer>; type TailPushed<T>;
fn trim_tail(self) -> Self::TailTrimmed; fn trim_tail(self) -> Self::TailTrimmed;
fn push_tail<T: NeuraLayer>(self, layer: T) -> Self::TailPushed<T>; fn push_tail<T>(self, layer: T) -> Self::TailPushed<T>;
} }
impl<Layer: NeuraLayer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> { impl<Layer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
pub fn new(layer: Layer, child_network: ChildNetwork) -> Self { pub fn new(layer: Layer, child_network: ChildNetwork) -> Self {
Self { Self {
layer, layer,
@ -28,9 +30,10 @@ impl<Layer: NeuraLayer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
} }
} }
pub fn new_match_output(layer: Layer, child_network: ChildNetwork) -> Self pub fn new_match_output<Input>(layer: Layer, child_network: ChildNetwork) -> Self
where where
ChildNetwork: NeuraLayer<Input = Layer::Output>, Layer: NeuraLayer<Input>,
ChildNetwork: NeuraLayer<Layer::Output>,
{ {
Self::new(layer, child_network) Self::new(layer, child_network)
} }
@ -39,7 +42,10 @@ impl<Layer: NeuraLayer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
*self.child_network *self.child_network
} }
pub fn push_front<T: NeuraLayer>(self, layer: T) -> NeuraSequential<T, Self> { pub fn push_front<Input, Input2, T: NeuraLayer<Input2, Output=Input>>(self, layer: T) -> NeuraSequential<T, Self>
where
Layer: NeuraLayer<Input>
{
NeuraSequential { NeuraSequential {
layer: layer, layer: layer,
child_network: Box::new(self), child_network: Box::new(self),
@ -48,15 +54,15 @@ impl<Layer: NeuraLayer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
} }
// Trimming the last layer returns an empty network // Trimming the last layer returns an empty network
impl<Layer: NeuraLayer> NeuraSequentialTail for NeuraSequential<Layer, ()> { impl<Layer> NeuraSequentialTail for NeuraSequential<Layer, ()> {
type TailTrimmed = (); type TailTrimmed = ();
type TailPushed<T: NeuraLayer> = NeuraSequential<Layer, NeuraSequential<T, ()>>; type TailPushed<T> = NeuraSequential<Layer, NeuraSequential<T, ()>>;
fn trim_tail(self) -> Self::TailTrimmed { fn trim_tail(self) -> Self::TailTrimmed {
() ()
} }
fn push_tail<T: NeuraLayer>(self, layer: T) -> Self::TailPushed<T> { fn push_tail<T>(self, layer: T) -> Self::TailPushed<T> {
NeuraSequential { NeuraSequential {
layer: self.layer, layer: self.layer,
child_network: Box::new(NeuraSequential { child_network: Box::new(NeuraSequential {
@ -68,11 +74,11 @@ impl<Layer: NeuraLayer> NeuraSequentialTail for NeuraSequential<Layer, ()> {
} }
// Trimming another layer returns a network which calls trim recursively // Trimming another layer returns a network which calls trim recursively
impl<Layer: NeuraLayer, ChildNetwork: NeuraSequentialTail> NeuraSequentialTail impl<Layer, ChildNetwork: NeuraSequentialTail> NeuraSequentialTail
for NeuraSequential<Layer, ChildNetwork> for NeuraSequential<Layer, ChildNetwork>
{ {
type TailTrimmed = NeuraSequential<Layer, <ChildNetwork as NeuraSequentialTail>::TailTrimmed>; type TailTrimmed = NeuraSequential<Layer, <ChildNetwork as NeuraSequentialTail>::TailTrimmed>;
type TailPushed<T: NeuraLayer> = type TailPushed<T> =
NeuraSequential<Layer, <ChildNetwork as NeuraSequentialTail>::TailPushed<T>>; NeuraSequential<Layer, <ChildNetwork as NeuraSequentialTail>::TailPushed<T>>;
fn trim_tail(self) -> Self::TailTrimmed { fn trim_tail(self) -> Self::TailTrimmed {
@ -82,7 +88,7 @@ impl<Layer: NeuraLayer, ChildNetwork: NeuraSequentialTail> NeuraSequentialTail
} }
} }
fn push_tail<T: NeuraLayer>(self, layer: T) -> Self::TailPushed<T> { fn push_tail<T>(self, layer: T) -> Self::TailPushed<T> {
NeuraSequential { NeuraSequential {
layer: self.layer, layer: self.layer,
child_network: Box::new(self.child_network.push_tail(layer)), child_network: Box::new(self.child_network.push_tail(layer)),
@ -90,62 +96,55 @@ impl<Layer: NeuraLayer, ChildNetwork: NeuraSequentialTail> NeuraSequentialTail
} }
} }
impl<Layer: NeuraLayer> NeuraLayer for NeuraSequential<Layer, ()> { impl<Input, Layer: NeuraLayer<Input>, ChildNetwork: NeuraLayer<Layer::Output>> NeuraLayer<Input>
type Input = Layer::Input;
type Output = Layer::Output;
fn eval(&self, input: &Self::Input) -> Self::Output {
self.layer.eval(input)
}
}
impl<Layer: NeuraLayer, ChildNetwork: NeuraLayer<Input = Layer::Output>> NeuraLayer
for NeuraSequential<Layer, ChildNetwork> for NeuraSequential<Layer, ChildNetwork>
{ {
type Input = Layer::Input;
type Output = ChildNetwork::Output; type Output = ChildNetwork::Output;
fn eval(&self, input: &Self::Input) -> Self::Output { fn eval(&self, input: &Input) -> Self::Output {
self.child_network.eval(&self.layer.eval(input)) self.child_network.eval(&self.layer.eval(input))
} }
} }
impl<Layer: NeuraTrainableLayer> NeuraTrainableNetwork for NeuraSequential<Layer, ()> { impl<Input: Clone> NeuraTrainableNetwork<Input> for () {
type Delta = Layer::Delta; type Delta = ();
fn apply_gradient(&mut self, gradient: &Self::Delta) { fn default_gradient(&self) -> () {
self.layer.apply_gradient(gradient); ()
}
fn apply_gradient(&mut self, _gradient: &()) {
// Noop
} }
fn backpropagate<Loss: NeuraLoss<Input = Self::Output>>( fn backpropagate<Loss: NeuraLoss<Input = Self::Output>>(
&self, &self,
input: &Self::Input, final_activation: &Input,
target: &Loss::Target, target: &Loss::Target,
loss: Loss, loss: Loss,
) -> (Self::Input, Self::Delta) { ) -> (Input, Self::Delta) {
let final_activation = self.layer.eval(input);
let backprop_epsilon = loss.nabla(target, &final_activation); let backprop_epsilon = loss.nabla(target, &final_activation);
self.layer.backpropagate(&input, backprop_epsilon)
}
fn regularize(&self) -> Self::Delta { (backprop_epsilon, ())
self.layer.regularize()
} }
fn prepare_epoch(&mut self) { fn regularize(&self) -> () {
self.layer.prepare_epoch(); ()
} }
fn cleanup(&mut self) { fn prepare(&mut self, _is_training: bool) {
self.layer.cleanup(); // Noop
} }
} }
impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainableNetwork<Input = Layer::Output>> impl<Input, Layer: NeuraTrainableLayer<Input>, ChildNetwork: NeuraTrainableNetwork<Layer::Output>>
NeuraTrainableNetwork for NeuraSequential<Layer, ChildNetwork> NeuraTrainableNetwork<Input> for NeuraSequential<Layer, ChildNetwork>
{ {
type Delta = (Layer::Delta, Box<ChildNetwork::Delta>); type Delta = (Layer::Gradient, Box<ChildNetwork::Delta>);
fn default_gradient(&self) -> Self::Delta {
(self.layer.default_gradient(), Box::new(self.child_network.default_gradient()))
}
fn apply_gradient(&mut self, gradient: &Self::Delta) { fn apply_gradient(&mut self, gradient: &Self::Delta) {
self.layer.apply_gradient(&gradient.0); self.layer.apply_gradient(&gradient.0);
@ -154,16 +153,16 @@ impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainableNetwork<Input = Lay
fn backpropagate<Loss: NeuraLoss<Input = Self::Output>>( fn backpropagate<Loss: NeuraLoss<Input = Self::Output>>(
&self, &self,
input: &Self::Input, input: &Input,
target: &Loss::Target, target: &Loss::Target,
loss: Loss, loss: Loss,
) -> (Self::Input, Self::Delta) { ) -> (Input, Self::Delta) {
let next_activation = self.layer.eval(input); let next_activation = self.layer.eval(input);
let (backprop_gradient, weights_gradient) = let (backprop_gradient, weights_gradient) =
self.child_network self.child_network
.backpropagate(&next_activation, target, loss); .backpropagate(&next_activation, target, loss);
let (backprop_gradient, layer_gradient) = let (backprop_gradient, layer_gradient) =
self.layer.backpropagate(input, backprop_gradient); self.layer.backprop_layer(input, backprop_gradient);
( (
backprop_gradient, backprop_gradient,
@ -173,23 +172,18 @@ impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainableNetwork<Input = Lay
fn regularize(&self) -> Self::Delta { fn regularize(&self) -> Self::Delta {
( (
self.layer.regularize(), self.layer.regularize_layer(),
Box::new(self.child_network.regularize()), Box::new(self.child_network.regularize()),
) )
} }
fn prepare_epoch(&mut self) { fn prepare(&mut self, is_training: bool) {
self.layer.prepare_epoch(); self.layer.prepare_layer(is_training);
self.child_network.prepare_epoch(); self.child_network.prepare(is_training);
}
fn cleanup(&mut self) {
self.layer.cleanup();
self.child_network.cleanup();
} }
} }
impl<Layer: NeuraLayer> From<Layer> for NeuraSequential<Layer, ()> { impl<Layer> From<Layer> for NeuraSequential<Layer, ()> {
fn from(layer: Layer) -> Self { fn from(layer: Layer) -> Self {
Self { Self {
layer, layer,
@ -198,6 +192,53 @@ impl<Layer: NeuraLayer> From<Layer> for NeuraSequential<Layer, ()> {
} }
} }
pub trait NeuraSequentialBuild {
type Constructed;
type Err;
fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err>;
}
#[derive(Debug, Clone)]
pub enum NeuraSequentialBuildErr<Err, ChildErr> {
Current(Err),
Child(ChildErr),
}
impl<Layer: NeuraPartialLayer> NeuraSequentialBuild for NeuraSequential<Layer, ()> {
type Constructed = NeuraSequential<Layer::Constructed, ()>;
type Err = Layer::Err;
fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
Ok(NeuraSequential {
layer: self.layer.construct(input_shape)?,
child_network: Box::new(())
})
}
}
impl<Layer: NeuraPartialLayer + , ChildNetwork: NeuraSequentialBuild> NeuraSequentialBuild for NeuraSequential<Layer, ChildNetwork> {
type Constructed = NeuraSequential<Layer::Constructed, ChildNetwork::Constructed>;
type Err = NeuraSequentialBuildErr<Layer::Err, ChildNetwork::Err>;
fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
let layer = self.layer.construct(input_shape).map_err(|e| NeuraSequentialBuildErr::Current(e))?;
// TODO: ensure that this operation (and all recursive operations) are directly allocated on the heap
let child_network = self.child_network
.construct(Layer::output_shape(&layer))
.map_err(|e| NeuraSequentialBuildErr::Child(e))?;
let child_network = Box::new(child_network);
Ok(NeuraSequential {
layer,
child_network,
})
}
}
/// An utility to recursively create a NeuraSequential network, while writing it in a declarative and linear fashion. /// An utility to recursively create a NeuraSequential network, while writing it in a declarative and linear fashion.
/// Note that this can quickly create big and unwieldly types. /// Note that this can quickly create big and unwieldly types.
#[macro_export] #[macro_export]
@ -211,41 +252,47 @@ macro_rules! neura_sequential {
}; };
[ $first:expr, $($rest:expr),+ $(,)? ] => { [ $first:expr, $($rest:expr),+ $(,)? ] => {
$crate::network::sequential::NeuraSequential::new_match_output($first, neura_sequential![$($rest),+]) $crate::network::sequential::NeuraSequential::new($first, neura_sequential![$($rest),+])
}; };
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use nalgebra::dvector;
use crate::{ use crate::{
derivable::{activation::Relu, regularize::NeuraL0}, derivable::{activation::Relu, regularize::NeuraL0},
layer::NeuraDenseLayer, layer::{NeuraDenseLayer, NeuraShape, NeuraLayer},
neura_layer, neura_layer,
}; };
use super::NeuraSequentialBuild;
#[test] #[test]
fn test_neura_network_macro() { fn test_neura_network_macro() {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
let _ = neura_sequential![ let _ = neura_sequential![
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, NeuraDenseLayer::from_rng(8, 12, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>,
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>, NeuraDenseLayer::from_rng(12, 16, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>,
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 2> NeuraDenseLayer::from_rng(16, 2, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>
]; ];
let _ = neura_sequential![ let _ = neura_sequential![
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, NeuraDenseLayer::from_rng(2, 2, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>,
]; ];
let _ = neura_sequential![ let _ = neura_sequential![
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, NeuraDenseLayer::from_rng(8, 16, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>,
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>, NeuraDenseLayer::from_rng(16, 12, &mut rng, Relu, NeuraL0) as NeuraDenseLayer<f64, _, _>,
]; ];
let _ = neura_sequential![ let network = neura_sequential![
neura_layer!("dense", 8, 16; Relu), neura_layer!("dense", 16, Relu),
neura_layer!("dense", 12; Relu), neura_layer!("dense", 12, Relu),
neura_layer!("dense", 2; Relu) neura_layer!("dense", 2, Relu)
]; ].construct(NeuraShape::Vector(2)).unwrap();
network.eval(&dvector![0.0f64, 0.0]);
} }
} }
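Building a network is now a fallible, recursive operation: each layer is constructed with the output shape of the previous one, and errors are wrapped in NeuraSequentialBuildErr::Current or ::Child depending on where they occur. A sketch of the full flow:

use nalgebra::DVector;
use neuramethyst::derivable::activation::Relu;
use neuramethyst::prelude::*;

fn main() {
    let network = neura_sequential![
        neura_layer!("dense", 16, Relu),
        neura_layer!("dense", 12, Relu),
        neura_layer!("dense", 2, Relu)
    ];

    match network.construct(NeuraShape::Vector(8)) {
        Ok(network) => {
            let output = network.eval(&DVector::from_element(8, 0.0f64));
            println!("{}", output);
        }
        Err(err) => eprintln!("failed to build the network: {:?}", err),
    }
}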

@ -0,0 +1,180 @@
use super::{NeuraLayer, NeuraTrainableLayer};
use crate::{
algebra::{NeuraMatrix, NeuraVector, NeuraVectorSpace},
derivable::NeuraDerivable,
};
use rand::Rng;
use rand_distr::Distribution;
#[derive(Clone, Debug)]
pub struct NeuraDenseLayer<
Act: NeuraDerivable<f64>,
Reg: NeuraDerivable<f64>,
const INPUT_LEN: usize,
const OUTPUT_LEN: usize,
> {
weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
bias: NeuraVector<OUTPUT_LEN, f64>,
activation: Act,
regularization: Reg,
}
impl<
Act: NeuraDerivable<f64>,
Reg: NeuraDerivable<f64>,
const INPUT_LEN: usize,
const OUTPUT_LEN: usize,
> NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
pub fn new(
weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
bias: NeuraVector<OUTPUT_LEN, f64>,
activation: Act,
regularization: Reg,
) -> Self {
Self {
weights,
bias,
activation,
regularization,
}
}
pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
let mut weights: NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64> = NeuraMatrix::from_value(0.0f64);
// Use Xavier (or He) initialisation, using the harmonic mean
// Ref: https://www.deeplearning.ai/ai-notes/initialization/index.html
let distribution = rand_distr::Normal::new(
0.0,
activation.variance_hint() * 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64),
)
.unwrap();
// let distribution = rand_distr::Uniform::new(-0.5, 0.5);
for i in 0..OUTPUT_LEN {
for j in 0..INPUT_LEN {
weights[i][j] = distribution.sample(rng);
}
}
Self {
weights,
// Biases are initialized based on the activation's hint
bias: NeuraVector::from_value(activation.bias_hint()),
activation,
regularization,
}
}
}
impl<
Act: NeuraDerivable<f64>,
Reg: NeuraDerivable<f64>,
const INPUT_LEN: usize,
const OUTPUT_LEN: usize,
> NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
type Input = NeuraVector<INPUT_LEN, f64>;
type Output = NeuraVector<OUTPUT_LEN, f64>;
fn eval(&self, input: &Self::Input) -> Self::Output {
let mut result = self.weights.multiply_vector(input);
for i in 0..OUTPUT_LEN {
result[i] = self.activation.eval(result[i] + self.bias[i]);
}
result
}
}
impl<
Act: NeuraDerivable<f64>,
Reg: NeuraDerivable<f64>,
const INPUT_LEN: usize,
const OUTPUT_LEN: usize,
> NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
type Delta = (
NeuraMatrix<INPUT_LEN, OUTPUT_LEN, f64>,
NeuraVector<OUTPUT_LEN, f64>,
);
fn backpropagate(
&self,
input: &Self::Input,
epsilon: Self::Output,
) -> (Self::Input, Self::Delta) {
let evaluated = self.weights.multiply_vector(input);
// Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
// with `self.activation'(input) ° epsilon = delta`
let mut delta: NeuraVector<OUTPUT_LEN, f64> = epsilon.clone();
for i in 0..OUTPUT_LEN {
delta[i] *= self.activation.derivate(evaluated[i]);
}
// Compute the weight gradient
let weights_gradient = delta.reverse_dot(input);
let new_epsilon = self.weights.transpose_multiply_vector(&delta);
// According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation
// The gradient of the bias is equal to the delta term of the backpropagation algorithm
let bias_gradient = delta;
(new_epsilon, (weights_gradient, bias_gradient))
}
fn apply_gradient(&mut self, gradient: &Self::Delta) {
NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
}
fn regularize(&self) -> Self::Delta {
let mut res = Self::Delta::default();
for i in 0..OUTPUT_LEN {
for j in 0..INPUT_LEN {
res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
}
}
// Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
res
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::{
derivable::{activation::Relu, regularize::NeuraL0},
utils::uniform_vector,
};
#[test]
fn test_from_rng() {
let mut rng = rand::thread_rng();
let layer: NeuraDenseLayer<_, _, 64, 32> =
NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
let mut input = [0.0; 64];
for x in 0..64 {
input[x] = rng.gen();
}
assert!(layer.eval(&input.into()).len() == 32);
}
#[test]
fn test_stack_overflow_big_layer() {
let layer = NeuraDenseLayer::from_rng(&mut rand::thread_rng(), Relu, NeuraL0)
as NeuraDenseLayer<Relu, NeuraL0, 1000, 1000>;
layer.backpropagate(&uniform_vector(), uniform_vector());
<NeuraDenseLayer<Relu, NeuraL0, 1000, 1000> as NeuraTrainableLayer>::Delta::zero();
}
}
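For reference, with weights W, bias b, input x and activation f, the backpropagation above computes (in the same notation as the comments in NeuraTrainableLayer):

    delta_l       = f'(W·x) ∘ epsilon_l    (note: the code derivates at W·x, without adding the bias back)
    nabla_W       = delta_l · xᵀ           (weights_gradient = delta.reverse_dot(input))
    nabla_b       = delta_l                (bias_gradient)
    epsilon_{l-1} = Wᵀ · delta_l           (new_epsilon = transpose_multiply_vector)

The refactored nalgebra version computes exactly the same quantities, with delta_l · xᵀ written as `&delta * input.transpose()` and Wᵀ · delta_l as `self.weights.tr_mul(&delta)`.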

@ -0,0 +1,170 @@
mod dense;
pub use dense::NeuraDenseLayer;
mod convolution;
pub use convolution::{NeuraConv1DPadLayer, NeuraConv2DBlockLayer, NeuraConv2DPadLayer};
mod dropout;
pub use dropout::NeuraDropoutLayer;
mod softmax;
pub use softmax::NeuraSoftmaxLayer;
mod one_hot;
pub use one_hot::NeuraOneHotLayer;
mod lock;
pub use lock::NeuraLockLayer;
mod pool;
pub use pool::{NeuraGlobalPoolLayer, NeuraPool1DLayer};
mod reshape;
pub use reshape::{NeuraFlattenLayer, NeuraReshapeLayer};
use crate::algebra::NeuraVectorSpace;
pub trait NeuraLayer {
type Input;
type Output;
fn eval(&self, input: &Self::Input) -> Self::Output;
}
pub trait NeuraTrainableLayer: NeuraLayer {
/// The representation of the layer gradient as a vector space
type Delta: NeuraVectorSpace;
/// Computes the backpropagation term and the derivative of the internal weights,
/// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer.
///
/// Note: we introduce the term `epsilon`, which together with the activation of the current function can be used to compute `delta_l`:
/// ```no_rust
/// f_l'(a_l) * epsilon_l = delta_l
/// ```
///
/// The function should then return a pair `(epsilon_{l-1}, δW_l)`,
/// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`.
/// Using this intermediate value for `delta` allows us to isolate it computation to the respective layers.
fn backpropagate(
&self,
input: &Self::Input,
epsilon: Self::Output,
) -> (Self::Input, Self::Delta);
/// Computes the regularization
fn regularize(&self) -> Self::Delta;
/// Applies `δW_l` to the weights of the layer
fn apply_gradient(&mut self, gradient: &Self::Delta);
/// Called before an iteration begins, to allow the layer to set itself up for training.
#[inline(always)]
fn prepare_epoch(&mut self) {}
/// Called at the end of training, to allow the layer to clean itself up
#[inline(always)]
fn cleanup(&mut self) {}
}
#[macro_export]
macro_rules! neura_layer {
( "dense", $( $shape:expr ),*; $activation:expr ) => {
$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
as neura_layer!("_dense_shape", $($shape),*)
};
( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
as neura_layer!("_dense_shape", $($shape),*)
};
( "_dense_shape", $output:expr ) => {
$crate::layer::NeuraDenseLayer<_, _, _, $output>
};
( "_dense_shape", $input:expr, $output:expr ) => {
$crate::layer::NeuraDenseLayer<_, _, $input, $output>
};
( "dropout", $probability:expr ) => {
$crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
as $crate::layer::NeuraDropoutLayer<_, _>
};
( "softmax" ) => {
$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
};
( "softmax", $length:expr ) => {
$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
};
( "one_hot" ) => {
$crate::layer::NeuraOneHotLayer as $crate::layer::NeuraOneHotLayer<2, _>
};
( "lock", $layer:expr ) => {
$crate::layer::NeuraLockLayer($layer)
};
( "conv1d_pad", $length:expr, $feats:expr; $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<$length, $feats, $window, _>
};
( "conv1d_pad"; $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv1DPadLayer::new($layer, Default::default()) as $crate::layer::NeuraConv1DPadLayer<_, _, $window, _>
};
( "conv2d_pad", $feats:expr, $length:expr; $width:expr, $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<$length, $feats, $window, _>
};
( "conv2d_pad"; $width:expr, $window:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DPadLayer::new($layer, Default::default(), $width) as $crate::layer::NeuraConv2DPadLayer<_, _, $window, _>
};
( "conv2d_block", $feats:expr, $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, $feats, $block_size, _>
};
( "conv2d_block", $width:expr, $height:expr; $block_size:expr; $layer:expr ) => {
$crate::layer::NeuraConv2DBlockLayer::new($layer) as $crate::layer::NeuraConv2DBlockLayer<$width, $height, _, $block_size, _>
};
( "pool_global"; $reduce:expr ) => {
$crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<_, _, _>
};
( "pool_global", $feats:expr, $length:expr; $reduce:expr ) => {
$crate::layer::NeuraGlobalPoolLayer::new($reduce) as $crate::layer::NeuraGlobalPoolLayer<$length, $feats, _>
};
( "pool1d", $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<_, $blocklength, _, _>
};
( "pool1d", $blocks:expr, $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, _, _>
};
( "pool1d", $feats:expr, $blocks:expr, $blocklength:expr; $reduce:expr ) => {
$crate::layer::NeuraPool1DLayer::new($reduce) as $crate::layer::NeuraPool1DLayer<$blocks, $blocklength, $feats, _>
};
( "unstable_flatten" ) => {
$crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<_, _, f64>
};
( "unstable_flatten", $width:expr, $height:expr ) => {
$crate::layer::NeuraFlattenLayer::new() as $crate::layer::NeuraFlattenLayer<$width, $height, f64>
};
( "unstable_reshape", $height:expr ) => {
$crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<_, $height, f64>
};
( "unstable_reshape", $width:expr, $height:expr ) => {
$crate::layer::NeuraReshapeLayer::new() as $crate::layer::NeuraReshapeLayer<$width, $height, f64>
};
}

@ -5,26 +5,20 @@ use crate::{
network::{sequential::NeuraSequential, NeuraTrainableNetwork}, network::{sequential::NeuraSequential, NeuraTrainableNetwork},
}; };
pub trait NeuraGradientSolver<Output, Target = Output> { pub trait NeuraGradientSolver<Input, Target, Trainable: NeuraTrainableNetwork<Input>> {
fn get_gradient<Layer: NeuraLayer, ChildNetwork>( fn get_gradient(
&self, &self,
trainable: &NeuraSequential<Layer, ChildNetwork>, trainable: &Trainable,
input: &Layer::Input, input: &Input,
target: &Target, target: &Target,
) -> <NeuraSequential<Layer, ChildNetwork> as NeuraTrainableNetwork>::Delta ) -> Trainable::Delta;
where
NeuraSequential<Layer, ChildNetwork>:
NeuraTrainableNetwork<Input = Layer::Input, Output = Output>;
fn score<Layer: NeuraLayer, ChildNetwork>( fn score(
&self, &self,
trainable: &NeuraSequential<Layer, ChildNetwork>, trainable: &Trainable,
input: &Layer::Input, input: &Input,
target: &Target, target: &Target,
) -> f64 ) -> f64;
where
NeuraSequential<Layer, ChildNetwork>:
NeuraTrainableNetwork<Input = Layer::Input, Output = Output>;
} }
#[non_exhaustive] #[non_exhaustive]
@ -38,32 +32,24 @@ impl<Loss: NeuraLoss + Clone> NeuraBackprop<Loss> {
} }
} }
impl<const N: usize, Loss: NeuraLoss<Input = NeuraVector<N, f64>> + Clone> impl<Input, Target, Trainable: NeuraTrainableNetwork<Input>, Loss: NeuraLoss<Input = Trainable::Output, Target = Target> + Clone>
NeuraGradientSolver<NeuraVector<N, f64>, Loss::Target> for NeuraBackprop<Loss> NeuraGradientSolver<Input, Target, Trainable> for NeuraBackprop<Loss>
{ {
fn get_gradient<Layer: NeuraLayer, ChildNetwork>( fn get_gradient(
&self, &self,
trainable: &NeuraSequential<Layer, ChildNetwork>, trainable: &Trainable,
input: &Layer::Input, input: &Input,
target: &Loss::Target, target: &Target,
) -> <NeuraSequential<Layer, ChildNetwork> as NeuraTrainableNetwork>::Delta ) -> Trainable::Delta {
where
NeuraSequential<Layer, ChildNetwork>:
NeuraTrainableNetwork<Input = Layer::Input, Output = NeuraVector<N, f64>>,
{
trainable.backpropagate(input, target, self.loss.clone()).1 trainable.backpropagate(input, target, self.loss.clone()).1
} }
fn score<Layer: NeuraLayer, ChildNetwork>( fn score(
&self, &self,
trainable: &NeuraSequential<Layer, ChildNetwork>, trainable: &Trainable,
input: &Layer::Input, input: &Input,
target: &Loss::Target, target: &Target,
) -> f64 ) -> f64 {
where
NeuraSequential<Layer, ChildNetwork>:
NeuraTrainableNetwork<Input = Layer::Input, Output = NeuraVector<N, f64>>,
{
let output = trainable.eval(&input); let output = trainable.eval(&input);
self.loss.eval(target, &output) self.loss.eval(target, &output)
} }
@ -137,41 +123,32 @@ impl NeuraBatchedTrainer {
} }
pub fn train< pub fn train<
Output, Input: Clone,
Target: Clone, Target: Clone,
GradientSolver: NeuraGradientSolver<Output, Target>, Network: NeuraTrainableNetwork<Input>,
Layer: NeuraLayer, GradientSolver: NeuraGradientSolver<Input, Target, Network>,
ChildNetwork, Inputs: IntoIterator<Item = (Input, Target)>,
Inputs: IntoIterator<Item = (Layer::Input, Target)>,
>( >(
&self, &self,
gradient_solver: GradientSolver, gradient_solver: GradientSolver,
network: &mut NeuraSequential<Layer, ChildNetwork>, network: &mut Network,
inputs: Inputs, inputs: Inputs,
test_inputs: &[(Layer::Input, Target)], test_inputs: &[(Input, Target)],
) where ) {
NeuraSequential<Layer, ChildNetwork>:
NeuraTrainableNetwork<Input = Layer::Input, Output = Output>,
Layer::Input: Clone,
{
let mut iter = inputs.into_iter(); let mut iter = inputs.into_iter();
let factor = -self.learning_rate / (self.batch_size as f64); let factor = -self.learning_rate / (self.batch_size as f64);
let momentum_factor = self.learning_momentum / self.learning_rate; let momentum_factor = self.learning_momentum / self.learning_rate;
let reg_factor = -self.learning_rate; let reg_factor = -self.learning_rate;
// Contains `momentum_factor * factor * gradient_sum_previous_iter` // Contains `momentum_factor * factor * gradient_sum_previous_iter`
let mut previous_gradient_sum = let mut previous_gradient_sum = network.default_gradient();
Box::<<NeuraSequential<Layer, ChildNetwork> as NeuraTrainableNetwork>::Delta>::zero();
'd: for iteration in 0..self.iterations { 'd: for iteration in 0..self.iterations {
let mut gradient_sum = Box::< let mut gradient_sum = network.default_gradient();
<NeuraSequential<Layer, ChildNetwork> as NeuraTrainableNetwork>::Delta, network.prepare(true);
>::zero();
network.prepare_epoch();
for _ in 0..self.batch_size { for _ in 0..self.batch_size {
if let Some((input, target)) = iter.next() { if let Some((input, target)) = iter.next() {
let gradient = let gradient = gradient_solver.get_gradient(&network, &input, &target);
Box::new(gradient_solver.get_gradient(&network, &input, &target));
gradient_sum.add_assign(&gradient); gradient_sum.add_assign(&gradient);
} else { } else {
break 'd; break 'd;
@ -194,7 +171,7 @@ impl NeuraBatchedTrainer {
} }
if self.log_iterations > 0 && (iteration + 1) % self.log_iterations == 0 { if self.log_iterations > 0 && (iteration + 1) % self.log_iterations == 0 {
network.cleanup(); network.prepare(false);
let mut loss_sum = 0.0; let mut loss_sum = 0.0;
for (input, target) in test_inputs { for (input, target) in test_inputs {
loss_sum += gradient_solver.score(&network, input, target); loss_sum += gradient_solver.score(&network, input, target);
@ -204,12 +181,14 @@ impl NeuraBatchedTrainer {
} }
} }
network.cleanup(); network.prepare(false);
} }
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use nalgebra::{DMatrix, dmatrix, dvector};
use super::*; use super::*;
use crate::{ use crate::{
assert_approx, assert_approx,
@ -224,19 +203,19 @@ mod test {
for wa in [0.0, 0.25, 0.5, 1.0] { for wa in [0.0, 0.25, 0.5, 1.0] {
for wb in [0.0, 0.25, 0.5, 1.0] { for wb in [0.0, 0.25, 0.5, 1.0] {
let network = NeuraSequential::new( let network = NeuraSequential::new(
NeuraDenseLayer::new([[wa, wb]].into(), [0.0].into(), Linear, NeuraL0), NeuraDenseLayer::new(dmatrix![wa, wb], dvector![0.0], Linear, NeuraL0),
(), (),
); );
let gradient = NeuraBackprop::new(Euclidean).get_gradient( let (gradient, _) = NeuraBackprop::new(Euclidean).get_gradient(
&network, &network,
&[1.0, 1.0].into(), &dvector![1.0, 1.0],
&[0.0].into(), &dvector![0.0],
); );
let expected = wa + wb; let expected = wa + wb;
assert!((gradient.0[0][0] - expected) < 0.001); assert!((gradient.0[(0, 0)] - expected) < 0.001);
assert!((gradient.0[0][1] - expected) < 0.001); assert!((gradient.0[(0, 1)] - expected) < 0.001);
} }
} }
} }
@ -247,42 +226,42 @@ mod test {
// Test that we get the same values as https://hmkcode.com/ai/backpropagation-step-by-step/ // Test that we get the same values as https://hmkcode.com/ai/backpropagation-step-by-step/
let network = neura_sequential![ let network = neura_sequential![
NeuraDenseLayer::new( NeuraDenseLayer::new(
[[0.11, 0.21], [0.12, 0.08]].into(), dmatrix![0.11, 0.21; 0.12, 0.08],
[0.0; 2].into(), dvector![0.0, 0.0],
Linear, Linear,
NeuraL0 NeuraL0
), ),
NeuraDenseLayer::new([[0.14, 0.15]].into(), [0.0].into(), Linear, NeuraL0) NeuraDenseLayer::new(dmatrix![0.14, 0.15], dvector![0.0], Linear, NeuraL0)
]; ];
let input = [2.0, 3.0]; let input = dvector![2.0, 3.0];
let target = [1.0]; let target = dvector![1.0];
let intermediary = network.clone().trim_tail().eval(&input.into()); let intermediary = network.clone().trim_tail().eval(&input);
assert_approx!(0.85, intermediary[0], EPSILON); assert_approx!(0.85, intermediary[0], EPSILON);
assert_approx!(0.48, intermediary[1], EPSILON); assert_approx!(0.48, intermediary[1], EPSILON);
assert_approx!(0.191, network.eval(&input.into())[0], EPSILON); assert_approx!(0.191, network.eval(&input)[0], EPSILON);
assert_approx!( assert_approx!(
0.327, 0.327,
Euclidean.eval(&target.into(), &network.eval(&input.into())), Euclidean.eval(&target, &network.eval(&input)),
0.001 0.001
); );
let delta = network.eval(&input.into())[0] - target[0]; let delta = network.eval(&input)[0] - target[0];
let (gradient_first, gradient_second) = let (gradient_first, gradient_second) =
NeuraBackprop::new(Euclidean).get_gradient(&network, &input.into(), &target.into()); NeuraBackprop::new(Euclidean).get_gradient(&network, &input, &target);
let gradient_first = gradient_first.0; let gradient_first = gradient_first.0;
let gradient_second = gradient_second.0[0]; let gradient_second = gradient_second.0.0;
assert_approx!(gradient_second[0], intermediary[0] * delta, EPSILON); assert_approx!(gradient_second[0], intermediary[0] * delta, EPSILON);
assert_approx!(gradient_second[1], intermediary[1] * delta, EPSILON); assert_approx!(gradient_second[1], intermediary[1] * delta, EPSILON);
assert_approx!(gradient_first[0][0], input[0] * delta * 0.14, EPSILON); assert_approx!(gradient_first[(0, 0)], input[0] * delta * 0.14, EPSILON);
assert_approx!(gradient_first[0][1], input[1] * delta * 0.14, EPSILON); assert_approx!(gradient_first[(0, 1)], input[1] * delta * 0.14, EPSILON);
assert_approx!(gradient_first[1][0], input[0] * delta * 0.15, EPSILON); assert_approx!(gradient_first[(1, 0)], input[0] * delta * 0.15, EPSILON);
assert_approx!(gradient_first[1][1], input[1] * delta * 0.15, EPSILON); assert_approx!(gradient_first[(1, 1)], input[1] * delta * 0.15, EPSILON);
} }
} }
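With the trainer generic over any NeuraTrainableNetwork<Input>, a training call now looks roughly like this (a sketch; how NeuraBatchedTrainer itself is configured is unchanged by this commit and not shown here):

use nalgebra::dvector;
use neuramethyst::derivable::activation::Relu;
use neuramethyst::derivable::loss::Euclidean;
use neuramethyst::prelude::*;

fn train_xor(trainer: &NeuraBatchedTrainer) {
    let mut network = neura_sequential![
        neura_layer!("dense", 4, Relu),
        neura_layer!("dense", 1, Relu)
    ]
    .construct(NeuraShape::Vector(2))
    .unwrap();

    let inputs = [
        (dvector![0.0, 0.0], dvector![0.0]),
        (dvector![0.0, 1.0], dvector![1.0]),
        (dvector![1.0, 0.0], dvector![1.0]),
        (dvector![1.0, 1.0], dvector![0.0]),
    ];

    // `train` accepts any IntoIterator of (input, target) pairs, plus a slice
    // of test pairs used for logging the loss every `log_iterations` batches.
    trainer.train(
        NeuraBackprop::new(Euclidean),
        &mut network,
        inputs.iter().cloned(),
        &inputs,
    );
}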
