🔥 Refactor of NeuraTrainableLayer, split it into multiple traits

2 years ago · d40098d2ef
parent f3752bd411
commit d40098d2ef
14 changed files with 425 additions and 201 deletions
--- a/examples/generate-tests.rs
+++ b/examples/generate-tests.rs
@ -30,7 +30,7 @@ fn main() {
        network.layer.weights.clone(),
        network.layer.bias.clone(),
        network.child_network.layer.weights.clone(),
-        network.child_network.layer.bias.clone()
+        network.child_network.layer.bias.clone(),
    )];

    for iteration in 0..4 {
@ -45,7 +45,7 @@ fn main() {
            network.layer.weights.clone(),
            network.layer.bias.clone(),
            network.child_network.layer.weights.clone(),
-            network.child_network.layer.bias.clone()
+            network.child_network.layer.bias.clone(),
        ));
    }

--- a/src/gradient_solver/backprop.rs
+++ b/src/gradient_solver/backprop.rs
@ -1,6 +1,9 @@
 use num::ToPrimitive;

-use crate::{derivable::NeuraLoss, layer::NeuraTrainableLayer, network::NeuraTrainableNetworkBase};
+use crate::{
+    derivable::NeuraLoss, layer::NeuraTrainableLayerBackprop, layer::NeuraTrainableLayerSelf,
+    network::NeuraTrainableNetworkBase,
+};

 use super::*;

@ -53,23 +56,26 @@ impl<LayerOutput, Target, Loss: NeuraLoss<LayerOutput, Target = Target>>
    }
 }

-impl<LayerOutput, Target, Loss> NeuraGradientSolverTransient<LayerOutput>
-    for (&NeuraBackprop<Loss>, &Target)
-{
-    fn eval_layer<
+impl<
        Input,
-        NetworkGradient,
-        RecGradient,
-        Layer: NeuraTrainableLayer<Input, Output = LayerOutput>,
-    >(
+        Target,
+        Loss,
+        Layer: NeuraTrainableLayerBackprop<Input> + NeuraTrainableLayerSelf<Input>,
+    > NeuraGradientSolverTransient<Input, Layer> for (&NeuraBackprop<Loss>, &Target)
+{
+    fn eval_layer<NetworkGradient, RecGradient>(
        &self,
        layer: &Layer,
        input: &Input,
-        rec_opt_output: Self::Output<LayerOutput, RecGradient>,
+        _output: &Layer::Output,
+        intermediary: &Layer::IntermediaryRepr,
+        rec_opt_output: Self::Output<Layer::Output, RecGradient>,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient> {
        let (epsilon_in, rec_gradient) = rec_opt_output;
-        let (epsilon_out, layer_gradient) = layer.backprop_layer(input, epsilon_in);
+
+        let epsilon_out = layer.backprop_layer(input, intermediary, &epsilon_in);
+        let layer_gradient = layer.get_gradient(input, intermediary, &epsilon_in);

        (epsilon_out, combine_gradients(layer_gradient, rec_gradient))
    }
@ -80,7 +86,11 @@ mod test {
    use approx::assert_relative_eq;

    use super::*;
-    use crate::{prelude::*, derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, utils::uniform_vector};
+    use crate::{
+        derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable},
+        prelude::*,
+        utils::uniform_vector,
+    };

    #[test]
    fn test_backprop_epsilon_bias() {
@ -91,16 +101,22 @@ mod test {
            let network = neura_sequential![
                neura_layer!("dense", 4, f64).activation(Tanh),
                neura_layer!("dense", 2, f64).activation(Tanh)
-            ].construct(NeuraShape::Vector(4)).unwrap();
+            ]
+            .construct(NeuraShape::Vector(4))
+            .unwrap();

            let optimizer = NeuraBackprop::new(Euclidean);
            let input = uniform_vector(4);
            let target = uniform_vector(2);

            let layer1_intermediary = &network.layer.weights * &input;
-            let layer2_intermediary = &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh());
+            let layer2_intermediary =
+                &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh());

-            assert_relative_eq!(layer1_intermediary.map(|x| x.tanh()), network.clone().trim_tail().eval(&input));
+            assert_relative_eq!(
+                layer1_intermediary.map(|x| x.tanh()),
+                network.clone().trim_tail().eval(&input)
+            );

            let output = network.eval(&input);

@ -114,12 +130,14 @@ mod test {

            assert_relative_eq!(delta2_actual.as_slice(), delta2_expected.as_slice());

-            let gradient2_expected = &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose();
+            let gradient2_expected =
+                &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose();
            let gradient2_actual = gradient.1 .0 .0;

            assert_relative_eq!(gradient2_actual.as_slice(), gradient2_expected.as_slice());

-            let mut delta1_expected = network.child_network.layer.weights.transpose() * delta2_expected;
+            let mut delta1_expected =
+                network.child_network.layer.weights.transpose() * delta2_expected;
            for i in 0..4 {
                delta1_expected[i] *= Tanh.derivate(layer1_intermediary[i]);
            }
--- a/src/gradient_solver/forward_forward.rs
+++ b/src/gradient_solver/forward_forward.rs
@ -1,7 +1,7 @@
 use nalgebra::{DVector, Scalar};
 use num::{traits::NumAssignOps, Float, ToPrimitive};

-use crate::derivable::NeuraDerivable;
+use crate::{derivable::NeuraDerivable, prelude::NeuraTrainableLayerSelf};

 use super::*;

@ -90,22 +90,23 @@ impl<Act, LayerOutput> NeuraGradientSolverFinal<LayerOutput> for NeuraForwardPai
    }
 }

-impl<F: Float + Scalar + NumAssignOps, Act: NeuraDerivable<F>>
-    NeuraGradientSolverTransient<DVector<F>> for NeuraForwardPair<Act>
-{
-    fn eval_layer<
+impl<
+        F: Float + Scalar + NumAssignOps,
+        Act: NeuraDerivable<F>,
        Input,
-        NetworkGradient,
-        RecGradient,
-        Layer: NeuraTrainableLayer<Input, Output = DVector<F>>,
-    >(
+        Layer: NeuraTrainableLayerSelf<Input, Output = DVector<F>>,
+    > NeuraGradientSolverTransient<Input, Layer> for NeuraForwardPair<Act>
+{
+    fn eval_layer<NetworkGradient, RecGradient>(
        &self,
        layer: &Layer,
        input: &Input,
+        output: &Layer::Output,
+        intermediary: &Layer::IntermediaryRepr,
        rec_gradient: RecGradient,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient> {
-        let output = layer.eval(input);
+        // let output = layer.eval(input);
        let goodness = output
            .iter()
            .copied()
@ -129,7 +130,7 @@ impl<F: Float + Scalar + NumAssignOps, Act: NeuraDerivable<F>>
        }

        // TODO: split backprop_layer into eval_training, get_gradient and get_backprop
-        let (_, layer_gradient) = layer.backprop_layer(input, goodness_derivative);
+        let layer_gradient = layer.get_gradient(input, intermediary, &goodness_derivative);

        combine_gradients(layer_gradient, rec_gradient)
    }
--- a/src/gradient_solver/mod.rs
+++ b/src/gradient_solver/mod.rs
@ -5,7 +5,7 @@ mod forward_forward;
 pub use forward_forward::NeuraForwardForward;

 use crate::{
-    layer::NeuraTrainableLayer,
+    layer::NeuraTrainableLayerBase,
    network::{NeuraTrainableNetwork, NeuraTrainableNetworkBase},
 };

@ -17,17 +17,16 @@ pub trait NeuraGradientSolverFinal<LayerOutput>: NeuraGradientSolverBase {
    fn eval_final(&self, output: LayerOutput) -> Self::Output<LayerOutput, ()>;
 }

-pub trait NeuraGradientSolverTransient<LayerOutput>: NeuraGradientSolverBase {
-    fn eval_layer<
-        Input,
-        NetworkGradient,
-        RecGradient,
-        Layer: NeuraTrainableLayer<Input, Output = LayerOutput>,
-    >(
+pub trait NeuraGradientSolverTransient<Input, Layer: NeuraTrainableLayerBase<Input>>:
+    NeuraGradientSolverBase
+{
+    fn eval_layer<NetworkGradient, RecGradient>(
        &self,
        layer: &Layer,
        input: &Input,
-        rec_opt_output: Self::Output<LayerOutput, RecGradient>,
+        output: &Layer::Output,
+        layer_intermediary: &Layer::IntermediaryRepr,
+        rec_opt_output: Self::Output<Layer::Output, RecGradient>,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient>;
 }
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@ -161,9 +161,9 @@ impl<
    fn eval(&self, input: &DVector<F>) -> Self::Output {
        assert_eq!(input.shape().0, self.weights.shape().1);

-        let res = &self.weights * input + &self.bias;
+        let evaluated = &self.weights * input + &self.bias;

-        res.map(|x| self.activation.eval(x))
+        evaluated.map(|x| self.activation.eval(x))
    }
 }

@ -171,9 +171,17 @@ impl<
        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
        Act: NeuraDerivable<F>,
        Reg: NeuraDerivable<F>,
-    > NeuraTrainableLayer<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
+    > NeuraTrainableLayerBase<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
 {
    type Gradient = (DMatrix<F>, DVector<F>);
+    type IntermediaryRepr = DVector<F>; // pre-activation values
+
+    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
+        let evaluated = &self.weights * input + &self.bias;
+        let output = evaluated.map(|x| self.activation.eval(x));
+
+        (output, evaluated)
+    }

    fn default_gradient(&self) -> Self::Gradient {
        (
@ -182,41 +190,70 @@ impl<
        )
    }

-    fn backprop_layer(
+    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
+        self.weights += &gradient.0;
+        self.bias += &gradient.1;
+    }
+}
+
+impl<
+        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
+        Act: NeuraDerivable<F>,
+        Reg: NeuraDerivable<F>,
+    > NeuraTrainableLayerSelf<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
+{
+    fn regularize_layer(&self) -> Self::Gradient {
+        (
+            self.weights.map(|x| self.regularization.derivate(x)),
+            DVector::zeros(self.bias.shape().0),
+        )
+    }
+
+    fn get_gradient(
        &self,
        input: &DVector<F>,
-        epsilon: Self::Output,
-    ) -> (DVector<F>, Self::Gradient) {
-        let evaluated = &self.weights * input + &self.bias;
+        evaluated: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Self::Gradient {
        // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
        // with `self.activation'(input) ° epsilon = delta`
        let mut delta = epsilon.clone();

        for i in 0..delta.len() {
+            // TODO: remove `- self.bias[i]`
            delta[i] *= self.activation.derivate(evaluated[i]);
        }

-        // Compute the weight gradient
        let weights_gradient = &delta * input.transpose();

-        let new_epsilon = self.weights.tr_mul(&delta);
-
        // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation
        // The gradient of the bias is equal to the delta term of the backpropagation algorithm
        let bias_gradient = delta;

-        (new_epsilon, (weights_gradient, bias_gradient))
+        (weights_gradient, bias_gradient)
+    }
 }

-    fn regularize_layer(&self) -> Self::Gradient {
-        (
-            self.weights.map(|x| self.regularization.derivate(x)),
-            DVector::zeros(self.bias.shape().0),
-        )
+impl<
+        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
+        Act: NeuraDerivable<F>,
+        Reg: NeuraDerivable<F>,
+    > NeuraTrainableLayerBackprop<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
+{
+    fn backprop_layer(
+        &self,
+        input: &DVector<F>,
+        evaluated: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> DVector<F> {
+        // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
+        // with `self.activation'(input) ° epsilon = delta`
+        let mut delta = epsilon.clone();
+
+        for i in 0..delta.len() {
+            delta[i] *= self.activation.derivate(evaluated[i]);
        }

-    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
-        self.weights += &gradient.0;
-        self.bias += &gradient.1;
+        self.weights.tr_mul(&delta)
    }
 }
--- a/src/layer/dropout.rs
+++ b/src/layer/dropout.rs
@ -61,24 +61,15 @@ impl<R: Rng, F: Float> NeuraLayer<DVector<F>> for NeuraDropoutLayer<R> {
    }
 }

-impl<R: Rng, F: Float> NeuraTrainableLayer<DVector<F>> for NeuraDropoutLayer<R> {
+impl<R: Rng, F: Float> NeuraTrainableLayerBase<DVector<F>> for NeuraDropoutLayer<R> {
    type Gradient = ();
+    type IntermediaryRepr = ();

-    fn default_gradient(&self) -> Self::Gradient {
-        ()
+    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
+        (self.eval(input), ())
    }

-    fn backprop_layer(
-        &self,
-        _input: &DVector<F>,
-        mut epsilon: Self::Output,
-    ) -> (DVector<F>, Self::Gradient) {
-        self.apply_dropout(&mut epsilon);
-
-        (epsilon, ())
-    }
-
-    fn regularize_layer(&self) -> Self::Gradient {
+    fn default_gradient(&self) -> Self::Gradient {
        ()
    }

@ -110,6 +101,36 @@ impl<R: Rng, F: Float> NeuraTrainableLayer<DVector<F>> for NeuraDropoutLayer<R>
    }
 }

+impl<R: Rng, F: Float> NeuraTrainableLayerSelf<DVector<F>> for NeuraDropoutLayer<R> {
+    fn regularize_layer(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn get_gradient(
+        &self,
+        _input: &DVector<F>,
+        _intermediary: &Self::IntermediaryRepr,
+        _epsilon: &Self::Output,
+    ) -> Self::Gradient {
+        ()
+    }
+}
+
+impl<R: Rng, F: Float> NeuraTrainableLayerBackprop<DVector<F>> for NeuraDropoutLayer<R> {
+    fn backprop_layer(
+        &self,
+        _input: &DVector<F>,
+        _intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> DVector<F> {
+        let mut epsilon = epsilon.clone();
+
+        self.apply_dropout(&mut epsilon);
+
+        epsilon
+    }
+}
+
 #[cfg(test)]
 mod test {
    use super::*;
@ -121,7 +142,7 @@ mod test {
            .unwrap();

        for _ in 0..100 {
-            <NeuraDropoutLayer<_> as NeuraTrainableLayer<DVector<f64>>>::prepare_layer(
+            <NeuraDropoutLayer<_> as NeuraTrainableLayerBase<DVector<f64>>>::prepare_layer(
                &mut layer, true,
            );
            assert!(layer.multiplier.is_finite());
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@ -23,6 +23,7 @@ impl NeuraShape {
 }

 pub trait NeuraLayer<Input> {
+    /// What type the layer outputs
    type Output;

    fn eval(&self, input: &Input) -> Self::Output;
@ -46,12 +47,64 @@ pub trait NeuraPartialLayer {
    fn output_shape(constructed: &Self::Constructed) -> NeuraShape;
 }

-pub trait NeuraTrainableLayer<Input>: NeuraLayer<Input> {
+pub trait NeuraTrainableLayerBase<Input>: NeuraLayer<Input> {
    /// The representation of the layer gradient as a vector space
    type Gradient: NeuraVectorSpace;

+    /// An intermediary object type to be passed to the various training methods
+    type IntermediaryRepr;
+
    fn default_gradient(&self) -> Self::Gradient;

+    /// Applies `δW_l` to the weights of the layer
+    fn apply_gradient(&mut self, gradient: &Self::Gradient);
+
+    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr);
+
+    /// Arbitrary computation that can be executed at the start of an epoch
+    #[allow(unused_variables)]
+    #[inline(always)]
+    fn prepare_layer(&mut self, is_training: bool) {}
+}
+
+/// Contains methods related to a layer's ability to compute its own weight gradients,
+/// given the derivative of the output variables.
+pub trait NeuraTrainableLayerSelf<Input>: NeuraTrainableLayerBase<Input> {
+    /// Computes the regularization term of the layer, expressed as a gradient
+    fn regularize_layer(&self) -> Self::Gradient;
+
+    /// Computes the layer's gradient.
+    ///
+    /// `intermediary` is guaranteed to have been generated by a previous call to `eval_training`,
+    /// without mutation of `self` in-between, and with the same `input`.
+    fn get_gradient(
+        &self,
+        input: &Input,
+        intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Self::Gradient;
+}
+
+// impl<Input, Layer: NeuraTrainableLayerBase<Input, Gradient = ()>> NeuraTrainableLayerSelf<Input>
+//     for Layer
+// {
+//     #[inline(always)]
+//     fn regularize_layer(&self) -> Self::Gradient {
+//         ()
+//     }
+
+//     #[inline(always)]
+//     fn get_gradient(
+//         &self,
+//         input: &Input,
+//         intermediary: &Self::IntermediaryRepr,
+//         epsilon: Self::Output,
+//     ) -> Self::Gradient {
+//         ()
+//     }
+// }
+
+pub trait NeuraTrainableLayerBackprop<Input>: NeuraTrainableLayerBase<Input> {
    /// Computes the backpropagation term (the derivative of the internal weights is computed separately by `get_gradient`),
    /// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer.
    ///
@ -63,42 +116,31 @@ pub trait NeuraTrainableLayer<Input>: NeuraLayer<Input> {
    /// The function should then return `epsilon_{l-1}` (the gradient `δW_l` is no longer part of the return value),
    /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`.
    /// Using this intermediate value for `delta` allows us to isolate its computation to the respective layers.
-    fn backprop_layer(&self, input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient);
-
-    /// Computes the regularization
-    fn regularize_layer(&self) -> Self::Gradient;
-
-    /// Applies `δW_l` to the weights of the layer
-    fn apply_gradient(&mut self, gradient: &Self::Gradient);
-
-    /// Arbitrary computation that can be executed at the start of an epoch
-    #[allow(unused_variables)]
-    #[inline(always)]
-    fn prepare_layer(&mut self, is_training: bool) {}
+    fn backprop_layer(
+        &self,
+        input: &Input,
+        intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Input;
 }

-impl<Input: Clone> NeuraTrainableLayer<Input> for () {
+impl<Input: Clone> NeuraTrainableLayerBase<Input> for () {
    type Gradient = ();
+    type IntermediaryRepr = ();

    #[inline(always)]
    fn default_gradient(&self) -> Self::Gradient {
        ()
    }

-    #[inline(always)]
-    fn backprop_layer(&self, _input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient) {
-        (epsilon, ())
-    }
-
-    #[inline(always)]
-    fn regularize_layer(&self) -> Self::Gradient {
-        ()
-    }
-
    #[inline(always)]
    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
+
+    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) {
+        (self.eval(input), ())
+    }
 }

 /// Temporary implementation of neura_layer
--- a/src/layer/normalize.rs
+++ b/src/layer/normalize.rs
@ -1,4 +1,4 @@
-use nalgebra::{DVector, Scalar};
+use nalgebra::{DMatrix, DVector, Scalar};
 use num::{traits::NumAssignOps, Float};

 use super::*;
@ -54,14 +54,19 @@ impl<F: Float + Scalar> NeuraLayer<DVector<F>> for NeuraNormalizeLayer {
    }
 }

-impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraNormalizeLayer {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBase<DVector<F>> for NeuraNormalizeLayer {
    type Gradient = ();
+    type IntermediaryRepr = (DMatrix<F>, F); // Partial Jacobian matrix (without the Kronecker term) and stddev

-    fn backprop_layer(
-        &self,
-        input: &DVector<F>,
-        epsilon: Self::Output,
-    ) -> (DVector<F>, Self::Gradient) {
+    fn default_gradient(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+        // Noop
+    }
+
+    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
        let (mean, variance, len) = mean_variance(input);
        let stddev = F::sqrt(variance);
        let input_centered = input.clone().map(|x| x - mean);
@ -73,26 +78,42 @@ impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for Neura
            *value += F::one() / (stddev * len);
        }

-        let mut epsilon_out = jacobian_partial * &epsilon;
-
-        // Apply the δ_{ik}/σ term
-        for i in 0..epsilon_out.len() {
-            epsilon_out[i] += epsilon[i] / stddev;
+        (input_centered / stddev, (jacobian_partial, stddev))
    }
-
-        (epsilon_out, ())
 }

-    fn default_gradient(&self) -> Self::Gradient {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerSelf<DVector<F>> for NeuraNormalizeLayer {
+    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }

-    fn regularize_layer(&self) -> Self::Gradient {
+    fn get_gradient(
+        &self,
+        input: &DVector<F>,
+        intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Self::Gradient {
        ()
    }
+}

-    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
-        // Noop
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBackprop<DVector<F>>
+    for NeuraNormalizeLayer
+{
+    fn backprop_layer(
+        &self,
+        input: &DVector<F>,
+        (jacobian_partial, stddev): &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> DVector<F> {
+        let mut epsilon_out = jacobian_partial * epsilon;
+
+        // Apply the δ_{ik}/σ term
+        for i in 0..epsilon_out.len() {
+            epsilon_out[i] += epsilon[i] / *stddev;
+        }
+
+        epsilon_out
    }
 }

--- a/src/layer/softmax.rs
+++ b/src/layer/softmax.rs
@ -54,22 +54,53 @@ impl NeuraPartialLayer for NeuraSoftmaxLayer {
    }
 }

-impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraSoftmaxLayer {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBase<DVector<F>> for NeuraSoftmaxLayer {
    type Gradient = ();
+    type IntermediaryRepr = Self::Output; // Result of self.eval

    fn default_gradient(&self) -> Self::Gradient {
        ()
    }

+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+        // Noop
+    }
+
+    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
+        let res = self.eval(input);
+        (res.clone(), res)
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerSelf<DVector<F>> for NeuraSoftmaxLayer {
+    #[inline(always)]
+    fn regularize_layer(&self) -> Self::Gradient {
+        ()
+    }
+
+    #[inline(always)]
+    fn get_gradient(
+        &self,
+        input: &DVector<F>,
+        intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Self::Gradient {
+        ()
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBackprop<DVector<F>>
+    for NeuraSoftmaxLayer
+{
    fn backprop_layer(
        &self,
        input: &DVector<F>,
-        mut epsilon: Self::Output,
-    ) -> (DVector<F>, Self::Gradient) {
-        // Note: a constant value can be added to `input` to bring it to increase precision
-        let evaluated = self.eval(input);
+        evaluated: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> DVector<F> {
+        let mut epsilon = epsilon.clone();

-        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        // Compute $a_{l-1,i} ° \epsilon_{l,i}$
        hadamard_product(&mut epsilon, &evaluated);

        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
@ -80,15 +111,7 @@ impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for Neura
            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
        }

-        (epsilon, ())
-    }
-
-    fn regularize_layer(&self) -> Self::Gradient {
-        ()
-    }
-
-    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
-        // Noop
+        epsilon
    }
 }

@ -132,8 +155,9 @@ mod test {
                for epsilon1 in [1.7, 1.9, 2.3] {
                    for epsilon2 in [2.9, 3.1, 3.7] {
                        let epsilon = dvector![epsilon1, epsilon2];
+                        let evaluated = layer.eval(&input);

-                        let (epsilon, _) = layer.backprop_layer(&input, epsilon);
+                        let epsilon = layer.backprop_layer(&input, &evaluated, &epsilon);
                        let expected = [
                            output[0] * (1.0 - output[0]) * epsilon1
                                - output[1] * output[0] * epsilon2,
@ -165,7 +189,8 @@ mod test {
            derivative += DMatrix::from_diagonal(&evaluated);

            let expected = derivative * &loss;
-            let (actual, _) = layer.backprop_layer(&input, loss);
+            let evaluated = layer.eval(&input);
+            let actual = layer.backprop_layer(&input, &evaluated, &loss);

            for i in 0..4 {
                assert!((expected[i] - actual[i]).abs() < EPSILON);
--- a/src/network/mod.rs
+++ b/src/network/mod.rs
@ -4,6 +4,7 @@ use crate::{

 pub mod sequential;

+// TODO: extract regularize from this, so that we can drop the trait constraints on NeuraSequential's impl
 pub trait NeuraTrainableNetworkBase<Input>: NeuraLayer<Input> {
    type Gradient: NeuraVectorSpace;
    type LayerOutput;
--- a/src/network/sequential/layer_impl.rs
+++ b/src/network/sequential/layer_impl.rs
@ -0,0 +1,96 @@
+use super::*;
+use crate::prelude::NeuraTrainableLayerBackprop;
+
+impl<Input, Layer: NeuraLayer<Input>, ChildNetwork: NeuraLayer<Layer::Output>> NeuraLayer<Input>
+    for NeuraSequential<Layer, ChildNetwork>
+{
+    type Output = ChildNetwork::Output;
+
+    fn eval(&self, input: &Input) -> Self::Output {
+        self.child_network.eval(&self.layer.eval(input))
+    }
+}
+
+impl<
+        Input,
+        Layer: NeuraTrainableLayerBase<Input>,
+        ChildNetwork: NeuraTrainableLayerBase<Layer::Output>,
+    > NeuraTrainableLayerBase<Input> for NeuraSequential<Layer, ChildNetwork>
+{
+    type Gradient = (Layer::Gradient, Box<ChildNetwork::Gradient>);
+    type IntermediaryRepr = (Layer::IntermediaryRepr, Box<ChildNetwork::IntermediaryRepr>);
+
+    fn default_gradient(&self) -> Self::Gradient {
+        (
+            self.layer.default_gradient(),
+            Box::new(self.child_network.default_gradient()),
+        )
+    }
+
+    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) {
+        let (layer_output, layer_intermediary) = self.layer.eval_training(input);
+        let (child_output, child_intermediary) = self.child_network.eval_training(&layer_output);
+
+        (
+            child_output,
+            (layer_intermediary, Box::new(child_intermediary)),
+        )
+    }
+
+    fn prepare_layer(&mut self, is_training: bool) {
+        self.layer.prepare_layer(is_training);
+        self.child_network.prepare_layer(is_training);
+    }
+
+    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
+        self.layer.apply_gradient(&gradient.0);
+        self.child_network.apply_gradient(&gradient.1);
+    }
+}
+
+impl<
+        Input,
+        Layer: NeuraTrainableLayerSelf<Input>,
+        ChildNetwork: NeuraTrainableLayerSelf<Layer::Output> + NeuraTrainableLayerBackprop<Layer::Output>,
+    > NeuraTrainableLayerSelf<Input> for NeuraSequential<Layer, ChildNetwork>
+{
+    fn regularize_layer(&self) -> Self::Gradient {
+        (
+            self.layer.regularize_layer(),
+            Box::new(self.child_network.regularize_layer()),
+        )
+    }
+
+    fn get_gradient(
+        &self,
+        input: &Input,
+        intermediary: &Self::IntermediaryRepr,
+        epsilon: &Self::Output,
+    ) -> Self::Gradient {
+        unimplemented!("NeuraSequential::get_gradient is not yet implemented, sorry");
+    }
+}
+
+impl<
+        Input,
+        Layer: NeuraTrainableLayerBackprop<Input>,
+        ChildNetwork: NeuraTrainableLayerBackprop<Layer::Output>,
+    > NeuraTrainableLayerBackprop<Input> for NeuraSequential<Layer, ChildNetwork>
+{
+    fn backprop_layer(
+        &self,
+        input: &Input,
+        intermediary: &Self::IntermediaryRepr,
+        incoming_epsilon: &Self::Output,
+    ) -> Input {
+        let transient_output = self.layer.eval(input);
+        let transient_epsilon =
+            self.child_network
+                .backprop_layer(&transient_output, &intermediary.1, incoming_epsilon);
+        let outgoing_epsilon =
+            self.layer
+                .backprop_layer(input, &intermediary.0, &transient_epsilon);
+
+        outgoing_epsilon
+    }
+}
--- a/src/network/sequential/mod.rs
+++ b/src/network/sequential/mod.rs
@ -1,10 +1,12 @@
 use super::{NeuraTrainableNetwork, NeuraTrainableNetworkBase};
 use crate::{
    gradient_solver::{NeuraGradientSolverFinal, NeuraGradientSolverTransient},
-    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer},
+    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayerBase},
+    prelude::NeuraTrainableLayerSelf,
 };

 mod construct;
+mod layer_impl;
 mod tail;

 pub use construct::*;
@ -24,7 +26,7 @@ pub use tail::*;
 /// ## Notes on implemented traits
 ///
 /// The different implementations for `NeuraTrainableNetwork`,
-/// `NeuraLayer` and `NeuraTrainableLayer` each require that `ChildNetwork` implements those respective traits,
+/// `NeuraLayer` and `NeuraTrainableLayerBase` each require that `ChildNetwork` implements those respective traits,
 /// and that the output type of `Layer` matches the input type of `ChildNetwork`.
 ///
 /// If a method, like `eval`, is reported as missing,
@ -74,61 +76,9 @@ impl<Layer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
    }
 }

-impl<Input, Layer: NeuraLayer<Input>, ChildNetwork: NeuraLayer<Layer::Output>> NeuraLayer<Input>
-    for NeuraSequential<Layer, ChildNetwork>
-{
-    type Output = ChildNetwork::Output;
-
-    fn eval(&self, input: &Input) -> Self::Output {
-        self.child_network.eval(&self.layer.eval(input))
-    }
-}
-
-impl<
-        Input,
-        Layer: NeuraTrainableLayer<Input>,
-        ChildNetwork: NeuraTrainableLayer<Layer::Output>,
-    > NeuraTrainableLayer<Input> for NeuraSequential<Layer, ChildNetwork>
-{
-    type Gradient = (Layer::Gradient, Box<ChildNetwork::Gradient>);
-
-    fn default_gradient(&self) -> Self::Gradient {
-        (
-            self.layer.default_gradient(),
-            Box::new(self.child_network.default_gradient()),
-        )
-    }
-
-    fn backprop_layer(
-        &self,
-        input: &Input,
-        incoming_epsilon: Self::Output,
-    ) -> (Input, Self::Gradient) {
-        let output = self.layer.eval(input);
-        let (transient_epsilon, child_gradient) =
-            self.child_network.backprop_layer(&output, incoming_epsilon);
-        let (outgoing_epsilon, layer_gradient) =
-            self.layer.backprop_layer(input, transient_epsilon);
-
-        (outgoing_epsilon, (layer_gradient, Box::new(child_gradient)))
-    }
-
-    fn regularize_layer(&self) -> Self::Gradient {
-        (
-            self.layer.regularize_layer(),
-            Box::new(self.child_network.regularize_layer()),
-        )
-    }
-
-    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
-        self.layer.apply_gradient(&gradient.0);
-        self.child_network.apply_gradient(&gradient.1);
-    }
-}
-
 impl<
        Input,
-        Layer: NeuraTrainableLayer<Input>,
+        Layer: NeuraTrainableLayerBase<Input> + NeuraTrainableLayerSelf<Input>,
        ChildNetwork: NeuraTrainableNetworkBase<Layer::Output>,
    > NeuraTrainableNetworkBase<Input> for NeuraSequential<Layer, ChildNetwork>
 {
@ -188,8 +138,8 @@ impl<Input: Clone> NeuraTrainableNetworkBase<Input> for () {

 impl<
        Input,
-        Layer: NeuraTrainableLayer<Input>,
-        Optimizer: NeuraGradientSolverTransient<Layer::Output>,
+        Layer: NeuraTrainableLayerBase<Input> + NeuraTrainableLayerSelf<Input>,
+        Optimizer: NeuraGradientSolverTransient<Input, Layer>,
        ChildNetwork: NeuraTrainableNetworkBase<Layer::Output>,
    > NeuraTrainableNetwork<Input, Optimizer> for NeuraSequential<Layer, ChildNetwork>
 where
@ -200,12 +150,14 @@ where
        input: &Input,
        optimizer: &Optimizer,
    ) -> Optimizer::Output<Input, Self::Gradient> {
-        let next_activation = self.layer.eval(input);
+        let (next_activation, intermediary) = self.layer.eval_training(input);
        let child_result = self.child_network.traverse(&next_activation, optimizer);

        optimizer.eval_layer(
            &self.layer,
            input,
+            &next_activation,
+            &intermediary,
            child_result,
            |layer_gradient, child_gradient| (layer_gradient, Box::new(child_gradient)),
        )
--- a/src/train.rs
+++ b/src/train.rs
@ -82,7 +82,10 @@ impl NeuraBatchedTrainer {
        network: &mut Network,
        inputs: Inputs,
        test_inputs: &[(Input, Target)],
-    ) -> Vec<(f64, f64)> {
+    ) -> Vec<(f64, f64)>
+    where
+        <Network as NeuraTrainableNetworkBase<Input>>::Gradient: std::fmt::Debug,
+    {
        let mut losses = Vec::new();
        let mut iter = inputs.into_iter();
        let factor = -self.learning_rate / (self.batch_size as f64);
--- a/tests/xor.rs
+++ b/tests/xor.rs
@ -1,12 +1,20 @@
 use std::fs::File;

 use approx::assert_relative_eq;
-use nalgebra::{DMatrix, DVector, dvector};
-use neuramethyst::{prelude::{*, dense::NeuraDenseLayer}, derivable::{activation::{Relu, Tanh}, regularize::NeuraL0, loss::Euclidean}};
+use nalgebra::{dvector, DMatrix, DVector};
+use neuramethyst::{
+    derivable::{
+        activation::{Relu, Tanh},
+        loss::Euclidean,
+        regularize::NeuraL0,
+    },
+    prelude::{dense::NeuraDenseLayer, *},
+};

 fn load_test_data() -> Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> {
    let file = File::open("tests/xor.json").unwrap();
-    let data: Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> = serde_json::from_reader(&file).unwrap();
+    let data: Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> =
+        serde_json::from_reader(&file).unwrap();

    data
 }
@ -43,7 +51,7 @@ fn test_xor_training() {
            network.layer.weights.clone(),
            network.layer.bias.clone(),
            network.child_network.layer.weights.clone(),
-            network.child_network.layer.bias.clone()
+            network.child_network.layer.bias.clone(),
        );

        assert_relative_eq!(expected.0.as_slice(), actual.0.as_slice());