From d40098d2efba6f6a634722ba1a8bfa4412e46e70 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Sat, 22 Apr 2023 10:31:46 +0200 Subject: [PATCH] :fire: Refactor of NeuraTrainableLayer, split it into multiple traits --- examples/generate-tests.rs | 4 +- src/gradient_solver/backprop.rs | 60 ++++++++++------ src/gradient_solver/forward_forward.rs | 23 +++--- src/gradient_solver/mod.rs | 17 +++-- src/layer/dense.rs | 77 +++++++++++++++------ src/layer/dropout.rs | 51 ++++++++++---- src/layer/mod.rs | 90 +++++++++++++++++------- src/layer/normalize.rs | 59 +++++++++++----- src/layer/softmax.rs | 59 +++++++++++----- src/network/mod.rs | 1 + src/network/sequential/layer_impl.rs | 96 ++++++++++++++++++++++++++ src/network/sequential/mod.rs | 68 +++--------------- src/train.rs | 5 +- tests/xor.rs | 16 +++-- 14 files changed, 425 insertions(+), 201 deletions(-) create mode 100644 src/network/sequential/layer_impl.rs diff --git a/examples/generate-tests.rs b/examples/generate-tests.rs index 1b33f3b..c35e334 100644 --- a/examples/generate-tests.rs +++ b/examples/generate-tests.rs @@ -30,7 +30,7 @@ fn main() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), )]; for iteration in 0..4 { @@ -45,7 +45,7 @@ fn main() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), )); } diff --git a/src/gradient_solver/backprop.rs b/src/gradient_solver/backprop.rs index 637253b..d6635ec 100644 --- a/src/gradient_solver/backprop.rs +++ b/src/gradient_solver/backprop.rs @@ -1,6 +1,9 @@ use num::ToPrimitive; -use crate::{derivable::NeuraLoss, layer::NeuraTrainableLayer, network::NeuraTrainableNetworkBase}; +use crate::{ + derivable::NeuraLoss, layer::NeuraTrainableLayerBackprop, layer::NeuraTrainableLayerSelf, + network::NeuraTrainableNetworkBase, +}; use super::*; @@ -53,23 +56,26 @@ impl> } } -impl NeuraGradientSolverTransient - for (&NeuraBackprop, &Target) -{ - fn eval_layer< +impl< Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer, - >( + Target, + Loss, + Layer: NeuraTrainableLayerBackprop + NeuraTrainableLayerSelf, + > NeuraGradientSolverTransient for (&NeuraBackprop, &Target) +{ + fn eval_layer( &self, layer: &Layer, input: &Input, - rec_opt_output: Self::Output, + _output: &Layer::Output, + intermediary: &Layer::IntermediaryRepr, + rec_opt_output: Self::Output, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output { let (epsilon_in, rec_gradient) = rec_opt_output; - let (epsilon_out, layer_gradient) = layer.backprop_layer(input, epsilon_in); + + let epsilon_out = layer.backprop_layer(input, intermediary, &epsilon_in); + let layer_gradient = layer.get_gradient(input, intermediary, &epsilon_in); (epsilon_out, combine_gradients(layer_gradient, rec_gradient)) } @@ -80,7 +86,11 @@ mod test { use approx::assert_relative_eq; use super::*; - use crate::{prelude::*, derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, utils::uniform_vector}; + use crate::{ + derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, + prelude::*, + utils::uniform_vector, + }; #[test] fn test_backprop_epsilon_bias() { @@ -91,16 +101,22 @@ mod test { let network = neura_sequential![ neura_layer!("dense", 4, f64).activation(Tanh), neura_layer!("dense", 2, f64).activation(Tanh) - 
].construct(NeuraShape::Vector(4)).unwrap(); + ] + .construct(NeuraShape::Vector(4)) + .unwrap(); let optimizer = NeuraBackprop::new(Euclidean); let input = uniform_vector(4); let target = uniform_vector(2); let layer1_intermediary = &network.layer.weights * &input; - let layer2_intermediary = &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh()); + let layer2_intermediary = + &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh()); - assert_relative_eq!(layer1_intermediary.map(|x| x.tanh()), network.clone().trim_tail().eval(&input)); + assert_relative_eq!( + layer1_intermediary.map(|x| x.tanh()), + network.clone().trim_tail().eval(&input) + ); let output = network.eval(&input); @@ -110,25 +126,27 @@ mod test { for i in 0..2 { delta2_expected[i] *= Tanh.derivate(layer2_intermediary[i]); } - let delta2_actual = gradient.1.0.1; + let delta2_actual = gradient.1 .0 .1; assert_relative_eq!(delta2_actual.as_slice(), delta2_expected.as_slice()); - let gradient2_expected = &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose(); - let gradient2_actual = gradient.1.0.0; + let gradient2_expected = + &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose(); + let gradient2_actual = gradient.1 .0 .0; assert_relative_eq!(gradient2_actual.as_slice(), gradient2_expected.as_slice()); - let mut delta1_expected = network.child_network.layer.weights.transpose() * delta2_expected; + let mut delta1_expected = + network.child_network.layer.weights.transpose() * delta2_expected; for i in 0..4 { delta1_expected[i] *= Tanh.derivate(layer1_intermediary[i]); } - let delta1_actual = gradient.0.1; + let delta1_actual = gradient.0 .1; assert_relative_eq!(delta1_actual.as_slice(), delta1_expected.as_slice()); let gradient1_expected = &delta1_expected * input.transpose(); - let gradient1_actual = gradient.0.0; + let gradient1_actual = gradient.0 .0; assert_relative_eq!(gradient1_actual.as_slice(), gradient1_expected.as_slice()); } diff --git a/src/gradient_solver/forward_forward.rs b/src/gradient_solver/forward_forward.rs index 76c8b70..2b88a00 100644 --- a/src/gradient_solver/forward_forward.rs +++ b/src/gradient_solver/forward_forward.rs @@ -1,7 +1,7 @@ use nalgebra::{DVector, Scalar}; use num::{traits::NumAssignOps, Float, ToPrimitive}; -use crate::derivable::NeuraDerivable; +use crate::{derivable::NeuraDerivable, prelude::NeuraTrainableLayerSelf}; use super::*; @@ -90,22 +90,23 @@ impl NeuraGradientSolverFinal for NeuraForwardPai } } -impl> - NeuraGradientSolverTransient> for NeuraForwardPair -{ - fn eval_layer< +impl< + F: Float + Scalar + NumAssignOps, + Act: NeuraDerivable, Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer>, - >( + Layer: NeuraTrainableLayerSelf>, + > NeuraGradientSolverTransient for NeuraForwardPair +{ + fn eval_layer( &self, layer: &Layer, input: &Input, + output: &Layer::Output, + intermediary: &Layer::IntermediaryRepr, rec_gradient: RecGradient, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output { - let output = layer.eval(input); + // let output = layer.eval(input); let goodness = output .iter() .copied() @@ -129,7 +130,7 @@ impl> } // TODO: split backprop_layer into eval_training, get_gradient and get_backprop - let (_, layer_gradient) = layer.backprop_layer(input, goodness_derivative); + let layer_gradient = layer.get_gradient(input, intermediary, &goodness_derivative); combine_gradients(layer_gradient, rec_gradient) } diff --git a/src/gradient_solver/mod.rs 
b/src/gradient_solver/mod.rs index 3f68076..732ca2c 100644 --- a/src/gradient_solver/mod.rs +++ b/src/gradient_solver/mod.rs @@ -5,7 +5,7 @@ mod forward_forward; pub use forward_forward::NeuraForwardForward; use crate::{ - layer::NeuraTrainableLayer, + layer::NeuraTrainableLayerBase, network::{NeuraTrainableNetwork, NeuraTrainableNetworkBase}, }; @@ -17,17 +17,16 @@ pub trait NeuraGradientSolverFinal: NeuraGradientSolverBase { fn eval_final(&self, output: LayerOutput) -> Self::Output; } -pub trait NeuraGradientSolverTransient: NeuraGradientSolverBase { - fn eval_layer< - Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer, - >( +pub trait NeuraGradientSolverTransient>: + NeuraGradientSolverBase +{ + fn eval_layer( &self, layer: &Layer, input: &Input, - rec_opt_output: Self::Output, + output: &Layer::Output, + layer_intermediary: &Layer::IntermediaryRepr, + rec_opt_output: Self::Output, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output; } diff --git a/src/layer/dense.rs b/src/layer/dense.rs index 2437f4d..0df187a 100644 --- a/src/layer/dense.rs +++ b/src/layer/dense.rs @@ -161,9 +161,9 @@ impl< fn eval(&self, input: &DVector) -> Self::Output { assert_eq!(input.shape().0, self.weights.shape().1); - let res = &self.weights * input + &self.bias; + let evaluated = &self.weights * input + &self.bias; - res.map(|x| self.activation.eval(x)) + evaluated.map(|x| self.activation.eval(x)) } } @@ -171,9 +171,17 @@ impl< F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, Act: NeuraDerivable, Reg: NeuraDerivable, - > NeuraTrainableLayer> for NeuraDenseLayer + > NeuraTrainableLayerBase> for NeuraDenseLayer { type Gradient = (DMatrix, DVector); + type IntermediaryRepr = DVector; // pre-activation values + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + let evaluated = &self.weights * input + &self.bias; + let output = evaluated.map(|x| self.activation.eval(x)); + + (output, evaluated) + } fn default_gradient(&self) -> Self::Gradient { ( @@ -182,41 +190,70 @@ impl< ) } - fn backprop_layer( + fn apply_gradient(&mut self, gradient: &Self::Gradient) { + self.weights += &gradient.0; + self.bias += &gradient.1; + } +} + +impl< + F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, + Act: NeuraDerivable, + Reg: NeuraDerivable, + > NeuraTrainableLayerSelf> for NeuraDenseLayer +{ + fn regularize_layer(&self) -> Self::Gradient { + ( + self.weights.map(|x| self.regularization.derivate(x)), + DVector::zeros(self.bias.shape().0), + ) + } + + fn get_gradient( &self, input: &DVector, - epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - let evaluated = &self.weights * input + &self.bias; + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), // with `self.activation'(input) ° epsilon = delta` let mut delta = epsilon.clone(); for i in 0..delta.len() { + // TODO: remove `- self.bias[i]` delta[i] *= self.activation.derivate(evaluated[i]); } - // Compute the weight gradient let weights_gradient = &delta * input.transpose(); - let new_epsilon = self.weights.tr_mul(&delta); - // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation // The gradient of the bias is equal to the delta term of the backpropagation algorithm let bias_gradient = delta; - (new_epsilon, 
(weights_gradient, bias_gradient)) + (weights_gradient, bias_gradient) } +} - fn regularize_layer(&self) -> Self::Gradient { - ( - self.weights.map(|x| self.regularization.derivate(x)), - DVector::zeros(self.bias.shape().0), - ) - } +impl< + F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, + Act: NeuraDerivable, + Reg: NeuraDerivable, + > NeuraTrainableLayerBackprop> for NeuraDenseLayer +{ + fn backprop_layer( + &self, + input: &DVector, + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), + // with `self.activation'(input) ° epsilon = delta` + let mut delta = epsilon.clone(); - fn apply_gradient(&mut self, gradient: &Self::Gradient) { - self.weights += &gradient.0; - self.bias += &gradient.1; + for i in 0..delta.len() { + delta[i] *= self.activation.derivate(evaluated[i]); + } + + self.weights.tr_mul(&delta) } } diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs index afd0511..b44fb32 100644 --- a/src/layer/dropout.rs +++ b/src/layer/dropout.rs @@ -61,24 +61,15 @@ impl NeuraLayer> for NeuraDropoutLayer { } } -impl NeuraTrainableLayer> for NeuraDropoutLayer { +impl NeuraTrainableLayerBase> for NeuraDropoutLayer { type Gradient = (); + type IntermediaryRepr = (); - fn default_gradient(&self) -> Self::Gradient { - () + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + (self.eval(input), ()) } - fn backprop_layer( - &self, - _input: &DVector, - mut epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - self.apply_dropout(&mut epsilon); - - (epsilon, ()) - } - - fn regularize_layer(&self) -> Self::Gradient { + fn default_gradient(&self) -> Self::Gradient { () } @@ -110,6 +101,36 @@ impl NeuraTrainableLayer> for NeuraDropoutLayer } } +impl NeuraTrainableLayerSelf> for NeuraDropoutLayer { + fn regularize_layer(&self) -> Self::Gradient { + () + } + + fn get_gradient( + &self, + _input: &DVector, + _intermediary: &Self::IntermediaryRepr, + _epsilon: &Self::Output, + ) -> Self::Gradient { + () + } +} + +impl NeuraTrainableLayerBackprop> for NeuraDropoutLayer { + fn backprop_layer( + &self, + _input: &DVector, + _intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon = epsilon.clone(); + + self.apply_dropout(&mut epsilon); + + epsilon + } +} + #[cfg(test)] mod test { use super::*; @@ -121,7 +142,7 @@ mod test { .unwrap(); for _ in 0..100 { - as NeuraTrainableLayer>>::prepare_layer( + as NeuraTrainableLayerBase>>::prepare_layer( &mut layer, true, ); assert!(layer.multiplier.is_finite()); diff --git a/src/layer/mod.rs b/src/layer/mod.rs index 10cc623..6a80e40 100644 --- a/src/layer/mod.rs +++ b/src/layer/mod.rs @@ -23,6 +23,7 @@ impl NeuraShape { } pub trait NeuraLayer { + /// What type the layer outputs type Output; fn eval(&self, input: &Input) -> Self::Output; @@ -46,12 +47,64 @@ pub trait NeuraPartialLayer { fn output_shape(constructed: &Self::Constructed) -> NeuraShape; } -pub trait NeuraTrainableLayer: NeuraLayer { +pub trait NeuraTrainableLayerBase: NeuraLayer { /// The representation of the layer gradient as a vector space type Gradient: NeuraVectorSpace; + /// An intermediary object type to be passed to the various training methods + type IntermediaryRepr; + fn default_gradient(&self) -> Self::Gradient; + /// Applies `δW_l` to the weights of the layer + fn apply_gradient(&mut self, gradient: &Self::Gradient); + + fn eval_training(&self, 
input: &Input) -> (Self::Output, Self::IntermediaryRepr); + + /// Arbitrary computation that can be executed at the start of an epoch + #[allow(unused_variables)] + #[inline(always)] + fn prepare_layer(&mut self, is_training: bool) {} +} + +/// Contains methods relative to a layer's ability to compute its own weights gradients, +/// given the derivative of the output variables. +pub trait NeuraTrainableLayerSelf: NeuraTrainableLayerBase { + /// Computes the regularization + fn regularize_layer(&self) -> Self::Gradient; + + /// Computes the layer's gradient, + /// + /// `intermediary` is guaranteed to have been generated by a previous call to `eval_training`, + /// without mutation of `self` in-between, and with the same `input`. + fn get_gradient( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient; +} + +// impl> NeuraTrainableLayerSelf +// for Layer +// { +// #[inline(always)] +// fn regularize_layer(&self) -> Self::Gradient { +// () +// } + +// #[inline(always)] +// fn get_gradient( +// &self, +// input: &Input, +// intermediary: &Self::IntermediaryRepr, +// epsilon: Self::Output, +// ) -> Self::Gradient { +// () +// } +// } + +pub trait NeuraTrainableLayerBackprop: NeuraTrainableLayerBase { /// Computes the backpropagation term and the derivative of the internal weights, /// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer. /// @@ -63,42 +116,31 @@ pub trait NeuraTrainableLayer: NeuraLayer { /// The function should then return a pair `(epsilon_{l-1}, δW_l)`, /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`. /// Using this intermediate value for `delta` allows us to isolate it computation to the respective layers. 
- fn backprop_layer(&self, input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient); - - /// Computes the regularization - fn regularize_layer(&self) -> Self::Gradient; - - /// Applies `δW_l` to the weights of the layer - fn apply_gradient(&mut self, gradient: &Self::Gradient); - - /// Arbitrary computation that can be executed at the start of an epoch - #[allow(unused_variables)] - #[inline(always)] - fn prepare_layer(&mut self, is_training: bool) {} + fn backprop_layer( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Input; } -impl NeuraTrainableLayer for () { +impl NeuraTrainableLayerBase for () { type Gradient = (); + type IntermediaryRepr = (); #[inline(always)] fn default_gradient(&self) -> Self::Gradient { () } - #[inline(always)] - fn backprop_layer(&self, _input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient) { - (epsilon, ()) - } - - #[inline(always)] - fn regularize_layer(&self) -> Self::Gradient { - () - } - #[inline(always)] fn apply_gradient(&mut self, _gradient: &Self::Gradient) { // Noop } + + fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) { + (self.eval(input), ()) + } } /// Temporary implementation of neura_layer diff --git a/src/layer/normalize.rs b/src/layer/normalize.rs index 291657e..2cb81a1 100644 --- a/src/layer/normalize.rs +++ b/src/layer/normalize.rs @@ -1,4 +1,4 @@ -use nalgebra::{DVector, Scalar}; +use nalgebra::{DMatrix, DVector, Scalar}; use num::{traits::NumAssignOps, Float}; use super::*; @@ -54,14 +54,19 @@ impl NeuraLayer> for NeuraNormalizeLayer { } } -impl NeuraTrainableLayer> for NeuraNormalizeLayer { +impl NeuraTrainableLayerBase> for NeuraNormalizeLayer { type Gradient = (); + type IntermediaryRepr = (DMatrix, F); // Partial jacobian matrix (without the kroenecker term) and stddev - fn backprop_layer( - &self, - input: &DVector, - epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { + fn default_gradient(&self) -> Self::Gradient { + () + } + + fn apply_gradient(&mut self, _gradient: &Self::Gradient) { + // Noop + } + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { let (mean, variance, len) = mean_variance(input); let stddev = F::sqrt(variance); let input_centered = input.clone().map(|x| x - mean); @@ -73,26 +78,42 @@ impl NeuraTrainableLayer> for Neura *value += F::one() / (stddev * len); } - let mut epsilon_out = jacobian_partial * ε - - // Apply the δ_{ik}/σ term - for i in 0..epsilon_out.len() { - epsilon_out[i] += epsilon[i] / stddev; - } - - (epsilon_out, ()) + (input_centered / stddev, (jacobian_partial, stddev)) } +} - fn default_gradient(&self) -> Self::Gradient { +impl NeuraTrainableLayerSelf> for NeuraNormalizeLayer { + fn regularize_layer(&self) -> Self::Gradient { () } - fn regularize_layer(&self) -> Self::Gradient { + fn get_gradient( + &self, + input: &DVector, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { () } +} - fn apply_gradient(&mut self, _gradient: &Self::Gradient) { - // Noop +impl NeuraTrainableLayerBackprop> + for NeuraNormalizeLayer +{ + fn backprop_layer( + &self, + input: &DVector, + (jacobian_partial, stddev): &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon_out = jacobian_partial * epsilon; + + // Apply the δ_{ik}/σ term + for i in 0..epsilon_out.len() { + epsilon_out[i] += epsilon[i] / *stddev; + } + + epsilon_out } } diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs index 
e428677..6f97472 100644 --- a/src/layer/softmax.rs +++ b/src/layer/softmax.rs @@ -54,22 +54,53 @@ impl NeuraPartialLayer for NeuraSoftmaxLayer { } } -impl NeuraTrainableLayer> for NeuraSoftmaxLayer { +impl NeuraTrainableLayerBase> for NeuraSoftmaxLayer { type Gradient = (); + type IntermediaryRepr = Self::Output; // Result of self.eval fn default_gradient(&self) -> Self::Gradient { () } + fn apply_gradient(&mut self, _gradient: &Self::Gradient) { + // Noop + } + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + let res = self.eval(input); + (res.clone(), res) + } +} + +impl NeuraTrainableLayerSelf> for NeuraSoftmaxLayer { + #[inline(always)] + fn regularize_layer(&self) -> Self::Gradient { + () + } + + #[inline(always)] + fn get_gradient( + &self, + input: &DVector, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { + () + } +} + +impl NeuraTrainableLayerBackprop> + for NeuraSoftmaxLayer +{ fn backprop_layer( &self, input: &DVector, - mut epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - // Note: a constant value can be added to `input` to bring it to increase precision - let evaluated = self.eval(input); + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon = epsilon.clone(); - // Compute $a_{l-1,i} \epsilon_{l,i}$ + // Compute $a_{l-1,i} ° \epsilon_{l,i}$ hadamard_product(&mut epsilon, &evaluated); // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ @@ -80,15 +111,7 @@ impl NeuraTrainableLayer> for Neura epsilon[i] -= evaluated[i] * sum_diagonal_terms; } - (epsilon, ()) - } - - fn regularize_layer(&self) -> Self::Gradient { - () - } - - fn apply_gradient(&mut self, _gradient: &Self::Gradient) { - // Noop + epsilon } } @@ -132,8 +155,9 @@ mod test { for epsilon1 in [1.7, 1.9, 2.3] { for epsilon2 in [2.9, 3.1, 3.7] { let epsilon = dvector![epsilon1, epsilon2]; + let evaluated = layer.eval(&input); - let (epsilon, _) = layer.backprop_layer(&input, epsilon); + let epsilon = layer.backprop_layer(&input, &evaluated, &epsilon); let expected = [ output[0] * (1.0 - output[0]) * epsilon1 - output[1] * output[0] * epsilon2, @@ -165,7 +189,8 @@ mod test { derivative += DMatrix::from_diagonal(&evaluated); let expected = derivative * &loss; - let (actual, _) = layer.backprop_layer(&input, loss); + let evaluated = layer.eval(&input); + let actual = layer.backprop_layer(&input, &evaluated, &loss); for i in 0..4 { assert!((expected[i] - actual[i]).abs() < EPSILON); diff --git a/src/network/mod.rs b/src/network/mod.rs index 823ee61..335889e 100644 --- a/src/network/mod.rs +++ b/src/network/mod.rs @@ -4,6 +4,7 @@ use crate::{ pub mod sequential; +// TODO: extract regularize from this, so that we can drop the trait constraints on NeuraSequential's impl pub trait NeuraTrainableNetworkBase: NeuraLayer { type Gradient: NeuraVectorSpace; type LayerOutput; diff --git a/src/network/sequential/layer_impl.rs b/src/network/sequential/layer_impl.rs new file mode 100644 index 0000000..e454b84 --- /dev/null +++ b/src/network/sequential/layer_impl.rs @@ -0,0 +1,96 @@ +use super::*; +use crate::prelude::NeuraTrainableLayerBackprop; + +impl, ChildNetwork: NeuraLayer> NeuraLayer + for NeuraSequential +{ + type Output = ChildNetwork::Output; + + fn eval(&self, input: &Input) -> Self::Output { + self.child_network.eval(&self.layer.eval(input)) + } +} + +impl< + Input, + Layer: NeuraTrainableLayerBase, + ChildNetwork: NeuraTrainableLayerBase, + > NeuraTrainableLayerBase for NeuraSequential +{ + 
type Gradient = (Layer::Gradient, Box); + type IntermediaryRepr = (Layer::IntermediaryRepr, Box); + + fn default_gradient(&self) -> Self::Gradient { + ( + self.layer.default_gradient(), + Box::new(self.child_network.default_gradient()), + ) + } + + fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) { + let (layer_output, layer_intermediary) = self.layer.eval_training(input); + let (child_output, child_intermediary) = self.child_network.eval_training(&layer_output); + + ( + child_output, + (layer_intermediary, Box::new(child_intermediary)), + ) + } + + fn prepare_layer(&mut self, is_training: bool) { + self.layer.prepare_layer(is_training); + self.child_network.prepare_layer(is_training); + } + + fn apply_gradient(&mut self, gradient: &Self::Gradient) { + self.layer.apply_gradient(&gradient.0); + self.child_network.apply_gradient(&gradient.1); + } +} + +impl< + Input, + Layer: NeuraTrainableLayerSelf, + ChildNetwork: NeuraTrainableLayerSelf + NeuraTrainableLayerBackprop, + > NeuraTrainableLayerSelf for NeuraSequential +{ + fn regularize_layer(&self) -> Self::Gradient { + ( + self.layer.regularize_layer(), + Box::new(self.child_network.regularize_layer()), + ) + } + + fn get_gradient( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { + unimplemented!("NeuraSequential::get_gradient is not yet implemented, sorry"); + } +} + +impl< + Input, + Layer: NeuraTrainableLayerBackprop, + ChildNetwork: NeuraTrainableLayerBackprop, + > NeuraTrainableLayerBackprop for NeuraSequential +{ + fn backprop_layer( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + incoming_epsilon: &Self::Output, + ) -> Input { + let transient_output = self.layer.eval(input); + let transient_epsilon = + self.child_network + .backprop_layer(&transient_output, &intermediary.1, incoming_epsilon); + let outgoing_epsilon = + self.layer + .backprop_layer(input, &intermediary.0, &transient_epsilon); + + outgoing_epsilon + } +} diff --git a/src/network/sequential/mod.rs b/src/network/sequential/mod.rs index a969d33..dfeb577 100644 --- a/src/network/sequential/mod.rs +++ b/src/network/sequential/mod.rs @@ -1,10 +1,12 @@ use super::{NeuraTrainableNetwork, NeuraTrainableNetworkBase}; use crate::{ gradient_solver::{NeuraGradientSolverFinal, NeuraGradientSolverTransient}, - layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer}, + layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayerBase}, + prelude::NeuraTrainableLayerSelf, }; mod construct; +mod layer_impl; mod tail; pub use construct::*; @@ -24,7 +26,7 @@ pub use tail::*; /// ## Notes on implemented traits /// /// The different implementations for `NeuraTrainableNetwork`, -/// `NeuraLayer` and `NeuraTrainableLayer` each require that `ChildNetwork` implements those respective traits, +/// `NeuraLayer` and `NeuraTrainableLayerBase` each require that `ChildNetwork` implements those respective traits, /// and that the output type of `Layer` matches the input type of `ChildNetwork`. 
/// /// If a method, like `eval`, is reported as missing, @@ -74,61 +76,9 @@ impl NeuraSequential { } } -impl, ChildNetwork: NeuraLayer> NeuraLayer - for NeuraSequential -{ - type Output = ChildNetwork::Output; - - fn eval(&self, input: &Input) -> Self::Output { - self.child_network.eval(&self.layer.eval(input)) - } -} - -impl< - Input, - Layer: NeuraTrainableLayer, - ChildNetwork: NeuraTrainableLayer, - > NeuraTrainableLayer for NeuraSequential -{ - type Gradient = (Layer::Gradient, Box); - - fn default_gradient(&self) -> Self::Gradient { - ( - self.layer.default_gradient(), - Box::new(self.child_network.default_gradient()), - ) - } - - fn backprop_layer( - &self, - input: &Input, - incoming_epsilon: Self::Output, - ) -> (Input, Self::Gradient) { - let output = self.layer.eval(input); - let (transient_epsilon, child_gradient) = - self.child_network.backprop_layer(&output, incoming_epsilon); - let (outgoing_epsilon, layer_gradient) = - self.layer.backprop_layer(input, transient_epsilon); - - (outgoing_epsilon, (layer_gradient, Box::new(child_gradient))) - } - - fn regularize_layer(&self) -> Self::Gradient { - ( - self.layer.regularize_layer(), - Box::new(self.child_network.regularize_layer()), - ) - } - - fn apply_gradient(&mut self, gradient: &Self::Gradient) { - self.layer.apply_gradient(&gradient.0); - self.child_network.apply_gradient(&gradient.1); - } -} - impl< Input, - Layer: NeuraTrainableLayer, + Layer: NeuraTrainableLayerBase + NeuraTrainableLayerSelf, ChildNetwork: NeuraTrainableNetworkBase, > NeuraTrainableNetworkBase for NeuraSequential { @@ -188,8 +138,8 @@ impl NeuraTrainableNetworkBase for () { impl< Input, - Layer: NeuraTrainableLayer, - Optimizer: NeuraGradientSolverTransient, + Layer: NeuraTrainableLayerBase + NeuraTrainableLayerSelf, + Optimizer: NeuraGradientSolverTransient, ChildNetwork: NeuraTrainableNetworkBase, > NeuraTrainableNetwork for NeuraSequential where @@ -200,12 +150,14 @@ where input: &Input, optimizer: &Optimizer, ) -> Optimizer::Output { - let next_activation = self.layer.eval(input); + let (next_activation, intermediary) = self.layer.eval_training(input); let child_result = self.child_network.traverse(&next_activation, optimizer); optimizer.eval_layer( &self.layer, input, + &next_activation, + &intermediary, child_result, |layer_gradient, child_gradient| (layer_gradient, Box::new(child_gradient)), ) diff --git a/src/train.rs b/src/train.rs index 576acca..24e1ce4 100644 --- a/src/train.rs +++ b/src/train.rs @@ -82,7 +82,10 @@ impl NeuraBatchedTrainer { network: &mut Network, inputs: Inputs, test_inputs: &[(Input, Target)], - ) -> Vec<(f64, f64)> { + ) -> Vec<(f64, f64)> + where + >::Gradient: std::fmt::Debug, + { let mut losses = Vec::new(); let mut iter = inputs.into_iter(); let factor = -self.learning_rate / (self.batch_size as f64); diff --git a/tests/xor.rs b/tests/xor.rs index 9f47459..1b9edb0 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -1,12 +1,20 @@ use std::fs::File; use approx::assert_relative_eq; -use nalgebra::{DMatrix, DVector, dvector}; -use neuramethyst::{prelude::{*, dense::NeuraDenseLayer}, derivable::{activation::{Relu, Tanh}, regularize::NeuraL0, loss::Euclidean}}; +use nalgebra::{dvector, DMatrix, DVector}; +use neuramethyst::{ + derivable::{ + activation::{Relu, Tanh}, + loss::Euclidean, + regularize::NeuraL0, + }, + prelude::{dense::NeuraDenseLayer, *}, +}; fn load_test_data() -> Vec<(DMatrix, DVector, DMatrix, DVector)> { let file = File::open("tests/xor.json").unwrap(); - let data: Vec<(DMatrix, DVector, DMatrix, DVector)> = 
serde_json::from_reader(&file).unwrap(); + let data: Vec<(DMatrix, DVector, DMatrix, DVector)> = + serde_json::from_reader(&file).unwrap(); data } @@ -43,7 +51,7 @@ fn test_xor_training() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), ); assert_relative_eq!(expected.0.as_slice(), actual.0.as_slice());
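
Note (illustrative, not part of the patch): this refactor splits the old `NeuraTrainableLayer` trait into `NeuraTrainableLayerBase` (forward pass with an `IntermediaryRepr`, plus gradient application), `NeuraTrainableLayerSelf` (regularization and weight gradients computed from that cached intermediary), and `NeuraTrainableLayerBackprop` (input-gradient computation). The sketch below shows how the three traits compose for a single layer during one training step, assuming the trait signatures introduced in `src/layer/mod.rs` and the prelude re-exports used elsewhere in the patch; `train_step` is a hypothetical helper, not crate API.

// Illustrative sketch only — not part of the patch.
// Assumes the traits are reachable through the prelude, as in the patched
// `layer_impl.rs` and `sequential/mod.rs`; `train_step` is hypothetical.
use neuramethyst::prelude::*;

fn train_step<Input, Layer>(
    layer: &mut Layer,
    input: &Input,
    epsilon: &Layer::Output, // gradient of the loss w.r.t. this layer's output
) -> Input
where
    Layer: NeuraTrainableLayerSelf<Input> + NeuraTrainableLayerBackprop<Input>,
{
    // Forward pass that also returns the layer's IntermediaryRepr
    // (for a dense layer, the pre-activation values).
    let (_output, intermediary) = layer.eval_training(input);

    // Weight gradient, computed from the cached intermediary instead of
    // re-evaluating the layer as the old backprop_layer did.
    let gradient = layer.get_gradient(input, &intermediary, epsilon);

    // Gradient with respect to the input, handed on to the previous layer.
    let epsilon_out = layer.backprop_layer(input, &intermediary, epsilon);

    // In the real trainer the gradient would first be scaled by
    // -learning_rate / batch_size (see src/train.rs) before being applied.
    layer.apply_gradient(&gradient);

    epsilon_out
}

One consequence of the split, visible in the patch itself: the forward-forward solver only bounds its layers on `NeuraTrainableLayerSelf`, so layers no longer need to implement backpropagation to be trainable with it, while the backprop solver bounds on both `NeuraTrainableLayerSelf` and `NeuraTrainableLayerBackprop`.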