🔥 Refactor of NeuraTrainableLayer, split it into multiple traits

2 years ago · d40098d2ef
parent f3752bd411
commit d40098d2ef
14 changed files with 425 additions and 201 deletions
--- a/examples/generate-tests.rs
+++ b/examples/generate-tests.rs
@ -30,7 +30,7 @@ fn main() {
        network.layer.weights.clone(),
        network.layer.bias.clone(),
        network.child_network.layer.weights.clone(),
-        network.child_network.layer.bias.clone()
+        network.child_network.layer.bias.clone(),
    )];
    for iteration in 0..4 {
@ -45,7 +45,7 @@ fn main() {
            network.layer.weights.clone(),
            network.layer.bias.clone(),
            network.child_network.layer.weights.clone(),
-            network.child_network.layer.bias.clone()
+            network.child_network.layer.bias.clone(),
        ));
    }
--- a/src/gradient_solver/backprop.rs
+++ b/src/gradient_solver/backprop.rs
@ -1,6 +1,9 @@
 use num::ToPrimitive;
-use crate::{derivable::NeuraLoss, layer::NeuraTrainableLayer, network::NeuraTrainableNetworkBase};
+use crate::{
    derivable::NeuraLoss, layer::NeuraTrainableLayerBackprop, layer::NeuraTrainableLayerSelf,
    network::NeuraTrainableNetworkBase,
 };
 use super::*;
@ -53,23 +56,26 @@ impl<LayerOutput, Target, Loss: NeuraLoss<LayerOutput, Target = Target>>
    }
 }
-impl<LayerOutput, Target, Loss> NeuraGradientSolverTransient<LayerOutput>
+impl<
    for (&NeuraBackprop<Loss>, &Target)
 {
    fn eval_layer<
        Input,
-        NetworkGradient,
+        Target,
-        RecGradient,
+        Loss,
-        Layer: NeuraTrainableLayer<Input, Output = LayerOutput>,
+        Layer: NeuraTrainableLayerBackprop<Input> + NeuraTrainableLayerSelf<Input>,
-    >(
+    > NeuraGradientSolverTransient<Input, Layer> for (&NeuraBackprop<Loss>, &Target)
 {
    fn eval_layer<NetworkGradient, RecGradient>(
        &self,
        layer: &Layer,
        input: &Input,
-        rec_opt_output: Self::Output<LayerOutput, RecGradient>,
+        _output: &Layer::Output,
        intermediary: &Layer::IntermediaryRepr,
        rec_opt_output: Self::Output<Layer::Output, RecGradient>,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient> {
        let (epsilon_in, rec_gradient) = rec_opt_output;
-        let (epsilon_out, layer_gradient) = layer.backprop_layer(input, epsilon_in);
+
        let epsilon_out = layer.backprop_layer(input, intermediary, &epsilon_in);
        let layer_gradient = layer.get_gradient(input, intermediary, &epsilon_in);
        (epsilon_out, combine_gradients(layer_gradient, rec_gradient))
    }
@ -80,7 +86,11 @@ mod test {
    use approx::assert_relative_eq;
    use super::*;
-    use crate::{prelude::*, derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, utils::uniform_vector};
+    use crate::{
        derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable},
        prelude::*,
        utils::uniform_vector,
    };
    #[test]
    fn test_backprop_epsilon_bias() {
@ -91,16 +101,22 @@ mod test {
            let network = neura_sequential![
                neura_layer!("dense", 4, f64).activation(Tanh),
                neura_layer!("dense", 2, f64).activation(Tanh)
-            ].construct(NeuraShape::Vector(4)).unwrap();
+            ]
            .construct(NeuraShape::Vector(4))
            .unwrap();
            let optimizer = NeuraBackprop::new(Euclidean);
            let input = uniform_vector(4);
            let target = uniform_vector(2);
            let layer1_intermediary = &network.layer.weights * &input;
-            let layer2_intermediary = &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh());
+            let layer2_intermediary =
                &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh());
-            assert_relative_eq!(layer1_intermediary.map(|x| x.tanh()), network.clone().trim_tail().eval(&input));
+            assert_relative_eq!(
                layer1_intermediary.map(|x| x.tanh()),
                network.clone().trim_tail().eval(&input)
            );
            let output = network.eval(&input);
@ -114,12 +130,14 @@ mod test {
            assert_relative_eq!(delta2_actual.as_slice(), delta2_expected.as_slice());
-            let gradient2_expected = &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose();
+            let gradient2_expected =
                &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose();
            let gradient2_actual = gradient.1 .0 .0;
            assert_relative_eq!(gradient2_actual.as_slice(), gradient2_expected.as_slice());
-            let mut delta1_expected = network.child_network.layer.weights.transpose() * delta2_expected;
+            let mut delta1_expected =
                network.child_network.layer.weights.transpose() * delta2_expected;
            for i in 0..4 {
                delta1_expected[i] *= Tanh.derivate(layer1_intermediary[i]);
            }
--- a/src/gradient_solver/forward_forward.rs
+++ b/src/gradient_solver/forward_forward.rs
@ -1,7 +1,7 @@
 use nalgebra::{DVector, Scalar};
 use num::{traits::NumAssignOps, Float, ToPrimitive};
-use crate::derivable::NeuraDerivable;
+use crate::{derivable::NeuraDerivable, prelude::NeuraTrainableLayerSelf};
 use super::*;
@ -90,22 +90,23 @@ impl<Act, LayerOutput> NeuraGradientSolverFinal<LayerOutput> for NeuraForwardPai
    }
 }
-impl<F: Float + Scalar + NumAssignOps, Act: NeuraDerivable<F>>
+impl<
-    NeuraGradientSolverTransient<DVector<F>> for NeuraForwardPair<Act>
+        F: Float + Scalar + NumAssignOps,
-{
+        Act: NeuraDerivable<F>,
    fn eval_layer<
        Input,
-        NetworkGradient,
+        Layer: NeuraTrainableLayerSelf<Input, Output = DVector<F>>,
-        RecGradient,
+    > NeuraGradientSolverTransient<Input, Layer> for NeuraForwardPair<Act>
-        Layer: NeuraTrainableLayer<Input, Output = DVector<F>>,
+{
-    >(
+    fn eval_layer<NetworkGradient, RecGradient>(
        &self,
        layer: &Layer,
        input: &Input,
        output: &Layer::Output,
        intermediary: &Layer::IntermediaryRepr,
        rec_gradient: RecGradient,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient> {
-        let output = layer.eval(input);
+        // let output = layer.eval(input);
        let goodness = output
            .iter()
            .copied()
@ -129,7 +130,7 @@ impl<F: Float + Scalar + NumAssignOps, Act: NeuraDerivable<F>>
        }
        // TODO: split backprop_layer into eval_training, get_gradient and get_backprop
-        let (_, layer_gradient) = layer.backprop_layer(input, goodness_derivative);
+        let layer_gradient = layer.get_gradient(input, intermediary, &goodness_derivative);
        combine_gradients(layer_gradient, rec_gradient)
    }
--- a/src/gradient_solver/mod.rs
+++ b/src/gradient_solver/mod.rs
@ -5,7 +5,7 @@ mod forward_forward;
 pub use forward_forward::NeuraForwardForward;
 use crate::{
-    layer::NeuraTrainableLayer,
+    layer::NeuraTrainableLayerBase,
    network::{NeuraTrainableNetwork, NeuraTrainableNetworkBase},
 };
@ -17,17 +17,16 @@ pub trait NeuraGradientSolverFinal<LayerOutput>: NeuraGradientSolverBase {
    fn eval_final(&self, output: LayerOutput) -> Self::Output<LayerOutput, ()>;
 }
-pub trait NeuraGradientSolverTransient<LayerOutput>: NeuraGradientSolverBase {
+pub trait NeuraGradientSolverTransient<Input, Layer: NeuraTrainableLayerBase<Input>>:
-    fn eval_layer<
+    NeuraGradientSolverBase
-        Input,
+{
-        NetworkGradient,
+    fn eval_layer<NetworkGradient, RecGradient>(
        RecGradient,
        Layer: NeuraTrainableLayer<Input, Output = LayerOutput>,
    >(
        &self,
        layer: &Layer,
        input: &Input,
-        rec_opt_output: Self::Output<LayerOutput, RecGradient>,
+        output: &Layer::Output,
        layer_intermediary: &Layer::IntermediaryRepr,
        rec_opt_output: Self::Output<Layer::Output, RecGradient>,
        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
    ) -> Self::Output<Input, NetworkGradient>;
 }
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@ -161,9 +161,9 @@ impl<
    fn eval(&self, input: &DVector<F>) -> Self::Output {
        assert_eq!(input.shape().0, self.weights.shape().1);
-        let res = &self.weights * input + &self.bias;
+        let evaluated = &self.weights * input + &self.bias;
-        res.map(|x| self.activation.eval(x))
+        evaluated.map(|x| self.activation.eval(x))
    }
 }
@ -171,9 +171,17 @@ impl<
        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
        Act: NeuraDerivable<F>,
        Reg: NeuraDerivable<F>,
-    > NeuraTrainableLayer<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
+    > NeuraTrainableLayerBase<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
 {
    type Gradient = (DMatrix<F>, DVector<F>);
    type IntermediaryRepr = DVector<F>; // pre-activation values
    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
        let evaluated = &self.weights * input + &self.bias;
        let output = evaluated.map(|x| self.activation.eval(x));
        (output, evaluated)
    }
    fn default_gradient(&self) -> Self::Gradient {
        (
@ -182,41 +190,70 @@ impl<
        )
    }
-    fn backprop_layer(
+    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
        self.weights += &gradient.0;
        self.bias += &gradient.1;
    }
 }
 impl<
        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
        Act: NeuraDerivable<F>,
        Reg: NeuraDerivable<F>,
    > NeuraTrainableLayerSelf<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
 {
    fn regularize_layer(&self) -> Self::Gradient {
        (
            self.weights.map(|x| self.regularization.derivate(x)),
            DVector::zeros(self.bias.shape().0),
        )
    }
    fn get_gradient(
        &self,
        input: &DVector<F>,
-        epsilon: Self::Output,
+        evaluated: &Self::IntermediaryRepr,
-    ) -> (DVector<F>, Self::Gradient) {
+        epsilon: &Self::Output,
-        let evaluated = &self.weights * input + &self.bias;
+    ) -> Self::Gradient {
        // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
        // with `self.activation'(input) ° epsilon = delta`
        let mut delta = epsilon.clone();
        for i in 0..delta.len() {
            // TODO: remove `- self.bias[i]`
            delta[i] *= self.activation.derivate(evaluated[i]);
        }
        // Compute the weight gradient
        let weights_gradient = &delta * input.transpose();
        let new_epsilon = self.weights.tr_mul(&delta);
        // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation
        // The gradient of the bias is equal to the delta term of the backpropagation algorithm
        let bias_gradient = delta;
-        (new_epsilon, (weights_gradient, bias_gradient))
+        (weights_gradient, bias_gradient)
    }
 }
-    fn regularize_layer(&self) -> Self::Gradient {
+impl<
-        (
+        F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign,
-            self.weights.map(|x| self.regularization.derivate(x)),
+        Act: NeuraDerivable<F>,
-            DVector::zeros(self.bias.shape().0),
+        Reg: NeuraDerivable<F>,
-        )
+    > NeuraTrainableLayerBackprop<DVector<F>> for NeuraDenseLayer<F, Act, Reg>
 {
    fn backprop_layer(
        &self,
        input: &DVector<F>,
        evaluated: &Self::IntermediaryRepr,
        epsilon: &Self::Output,
    ) -> DVector<F> {
        // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron),
        // with `self.activation'(input) ° epsilon = delta`
        let mut delta = epsilon.clone();
        for i in 0..delta.len() {
            delta[i] *= self.activation.derivate(evaluated[i]);
        }
-    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
+        self.weights.tr_mul(&delta)
        self.weights += &gradient.0;
        self.bias += &gradient.1;
    }
 }
--- a/src/layer/dropout.rs
+++ b/src/layer/dropout.rs
@ -61,24 +61,15 @@ impl<R: Rng, F: Float> NeuraLayer<DVector<F>> for NeuraDropoutLayer<R> {
    }
 }
-impl<R: Rng, F: Float> NeuraTrainableLayer<DVector<F>> for NeuraDropoutLayer<R> {
+impl<R: Rng, F: Float> NeuraTrainableLayerBase<DVector<F>> for NeuraDropoutLayer<R> {
    type Gradient = ();
    type IntermediaryRepr = ();
-    fn default_gradient(&self) -> Self::Gradient {
+    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
-        ()
+        (self.eval(input), ())
    }
-    fn backprop_layer(
+    fn default_gradient(&self) -> Self::Gradient {
        &self,
        _input: &DVector<F>,
        mut epsilon: Self::Output,
    ) -> (DVector<F>, Self::Gradient) {
        self.apply_dropout(&mut epsilon);
        (epsilon, ())
    }
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
@ -110,6 +101,36 @@ impl<R: Rng, F: Float> NeuraTrainableLayer<DVector<F>> for NeuraDropoutLayer<R>
    }
 }
 /// Self-gradient implementation for the dropout layer.
 ///
 /// The layer's `Gradient` type is `()`, so both methods return the unit value.
 impl<R: Rng, F: Float> NeuraTrainableLayerSelf<DVector<F>> for NeuraDropoutLayer<R> {
    /// Returns the (unit) regularization gradient.
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
    /// Returns the (unit) layer gradient; every argument is ignored.
    fn get_gradient(
        &self,
        _input: &DVector<F>,
        _intermediary: &Self::IntermediaryRepr,
        _epsilon: &Self::Output,
    ) -> Self::Gradient {
        ()
    }
 }
 /// Backpropagation through the dropout layer.
 impl<R: Rng, F: Float> NeuraTrainableLayerBackprop<DVector<F>> for NeuraDropoutLayer<R> {
    /// Propagates `epsilon` backwards by applying `apply_dropout` to a copy of it.
    ///
    /// `_input` and `_intermediary` are not needed: only the incoming `epsilon` is transformed.
    /// The caller's `epsilon` is left untouched; a modified clone is returned.
    fn backprop_layer(
        &self,
        _input: &DVector<F>,
        _intermediary: &Self::IntermediaryRepr,
        epsilon: &Self::Output,
    ) -> DVector<F> {
        // Work on an owned copy, since `apply_dropout` mutates its argument in place.
        let mut masked_epsilon = epsilon.clone();
        self.apply_dropout(&mut masked_epsilon);
        masked_epsilon
    }
 }
 #[cfg(test)]
 mod test {
    use super::*;
@ -121,7 +142,7 @@ mod test {
            .unwrap();
        for _ in 0..100 {
-            <NeuraDropoutLayer<_> as NeuraTrainableLayer<DVector<f64>>>::prepare_layer(
+            <NeuraDropoutLayer<_> as NeuraTrainableLayerBase<DVector<f64>>>::prepare_layer(
                &mut layer, true,
            );
            assert!(layer.multiplier.is_finite());
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@ -23,6 +23,7 @@ impl NeuraShape {
 }
 pub trait NeuraLayer<Input> {
    /// What type the layer outputs
    type Output;
    fn eval(&self, input: &Input) -> Self::Output;
@ -46,12 +47,64 @@ pub trait NeuraPartialLayer {
    fn output_shape(constructed: &Self::Constructed) -> NeuraShape;
 }
-pub trait NeuraTrainableLayer<Input>: NeuraLayer<Input> {
+pub trait NeuraTrainableLayerBase<Input>: NeuraLayer<Input> {
    /// The representation of the layer gradient as a vector space
    type Gradient: NeuraVectorSpace;
    /// An intermediary object type to be passed to the various training methods
    type IntermediaryRepr;
    fn default_gradient(&self) -> Self::Gradient;
    /// Applies `δW_l` to the weights of the layer
    fn apply_gradient(&mut self, gradient: &Self::Gradient);
    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr);
    /// Arbitrary computation that can be executed at the start of an epoch
    #[allow(unused_variables)]
    #[inline(always)]
    fn prepare_layer(&mut self, is_training: bool) {}
 }
 /// Contains methods relative to a layer's ability to compute its own weights gradients,
 /// given the derivative of the output variables.
 ///
 /// This trait builds on `NeuraTrainableLayerBase`, from which the `Gradient` and
 /// `IntermediaryRepr` associated types used below are taken.
 pub trait NeuraTrainableLayerSelf<Input>: NeuraTrainableLayerBase<Input> {
    /// Computes the regularization
    ///
    /// The result is expressed in the same `Gradient` type as the one returned by `get_gradient`.
    fn regularize_layer(&self) -> Self::Gradient;
    /// Computes the layer's gradient,
    ///
    /// `intermediary` is guaranteed to have been generated by a previous call to `eval_training`,
    /// without mutation of `self` in-between, and with the same `input`.
    ///
    /// - `input`: the value that was fed to the layer
    /// - `intermediary`: the second element of the pair returned by `eval_training`
    /// - `epsilon`: the derivative of the output variables, with the same type as the layer's output
    fn get_gradient(
        &self,
        input: &Input,
        intermediary: &Self::IntermediaryRepr,
        epsilon: &Self::Output,
    ) -> Self::Gradient;
 }
 // impl<Input, Layer: NeuraTrainableLayerBase<Input, Gradient = ()>> NeuraTrainableLayerSelf<Input>
 //     for Layer
 // {
 //     #[inline(always)]
 //     fn regularize_layer(&self) -> Self::Gradient {
 //         ()
 //     }
 //     #[inline(always)]
 //     fn get_gradient(
 //         &self,
 //         input: &Input,
 //         intermediary: &Self::IntermediaryRepr,
 //         epsilon: Self::Output,
 //     ) -> Self::Gradient {
 //         ()
 //     }
 // }
 pub trait NeuraTrainableLayerBackprop<Input>: NeuraTrainableLayerBase<Input> {
    /// Computes the backpropagation term (the derivative of the internal weights is computed separately, by `get_gradient`),
    /// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer.
    ///
@ -63,42 +116,31 @@ pub trait NeuraTrainableLayer<Input>: NeuraLayer<Input> {
    /// The function should then return `epsilon_{l-1}` (the weight gradient `δW_l` is now computed by `get_gradient`),
    /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`.
    /// Using this intermediate value for `delta` allows us to isolate its computation to the respective layers.
-    fn backprop_layer(&self, input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient);
+    fn backprop_layer(
-
+        &self,
-    /// Computes the regularization
+        input: &Input,
-    fn regularize_layer(&self) -> Self::Gradient;
+        intermediary: &Self::IntermediaryRepr,
-
+        epsilon: &Self::Output,
-    /// Applies `δW_l` to the weights of the layer
+    ) -> Input;
    fn apply_gradient(&mut self, gradient: &Self::Gradient);
    /// Arbitrary computation that can be executed at the start of an epoch
    #[allow(unused_variables)]
    #[inline(always)]
    fn prepare_layer(&mut self, is_training: bool) {}
 }
-impl<Input: Clone> NeuraTrainableLayer<Input> for () {
+impl<Input: Clone> NeuraTrainableLayerBase<Input> for () {
    type Gradient = ();
    type IntermediaryRepr = ();
    #[inline(always)]
    fn default_gradient(&self) -> Self::Gradient {
        ()
    }
    #[inline(always)]
    fn backprop_layer(&self, _input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient) {
        (epsilon, ())
    }
    #[inline(always)]
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
    #[inline(always)]
    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) {
        (self.eval(input), ())
    }
 }
 /// Temporary implementation of neura_layer
--- a/src/layer/normalize.rs
+++ b/src/layer/normalize.rs
@ -1,4 +1,4 @@
-use nalgebra::{DVector, Scalar};
+use nalgebra::{DMatrix, DVector, Scalar};
 use num::{traits::NumAssignOps, Float};
 use super::*;
@ -54,14 +54,19 @@ impl<F: Float + Scalar> NeuraLayer<DVector<F>> for NeuraNormalizeLayer {
    }
 }
-impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraNormalizeLayer {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBase<DVector<F>> for NeuraNormalizeLayer {
    type Gradient = ();
    type IntermediaryRepr = (DMatrix<F>, F); // Partial Jacobian matrix (without the Kronecker term) and stddev
-    fn backprop_layer(
+    fn default_gradient(&self) -> Self::Gradient {
-        &self,
+        ()
-        input: &DVector<F>,
+    }
-        epsilon: Self::Output,
+
-    ) -> (DVector<F>, Self::Gradient) {
+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
        let (mean, variance, len) = mean_variance(input);
        let stddev = F::sqrt(variance);
        let input_centered = input.clone().map(|x| x - mean);
@ -73,26 +78,42 @@ impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for Neura
            *value += F::one() / (stddev * len);
        }
-        let mut epsilon_out = jacobian_partial * &epsilon;
+        (input_centered / stddev, (jacobian_partial, stddev))
        // Apply the δ_{ik}/σ term
        for i in 0..epsilon_out.len() {
            epsilon_out[i] += epsilon[i] / stddev;
    }
        (epsilon_out, ())
 }
-    fn default_gradient(&self) -> Self::Gradient {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerSelf<DVector<F>> for NeuraNormalizeLayer {
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
-    fn regularize_layer(&self) -> Self::Gradient {
+    fn get_gradient(
        &self,
        input: &DVector<F>,
        intermediary: &Self::IntermediaryRepr,
        epsilon: &Self::Output,
    ) -> Self::Gradient {
        ()
    }
 }
-    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBackprop<DVector<F>>
-        // Noop
+    for NeuraNormalizeLayer
 {
    fn backprop_layer(
        &self,
        input: &DVector<F>,
        (jacobian_partial, stddev): &Self::IntermediaryRepr,
        epsilon: &Self::Output,
    ) -> DVector<F> {
        let mut epsilon_out = jacobian_partial * epsilon;
        // Apply the δ_{ik}/σ term
        for i in 0..epsilon_out.len() {
            epsilon_out[i] += epsilon[i] / *stddev;
        }
        epsilon_out
    }
 }
--- a/src/layer/softmax.rs
+++ b/src/layer/softmax.rs
@ -54,22 +54,53 @@ impl NeuraPartialLayer for NeuraSoftmaxLayer {
    }
 }
-impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraSoftmaxLayer {
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBase<DVector<F>> for NeuraSoftmaxLayer {
    type Gradient = ();
    type IntermediaryRepr = Self::Output; // Result of self.eval
    fn default_gradient(&self) -> Self::Gradient {
        ()
    }
    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
    fn eval_training(&self, input: &DVector<F>) -> (Self::Output, Self::IntermediaryRepr) {
        let res = self.eval(input);
        (res.clone(), res)
    }
 }
 /// Self-gradient implementation for the softmax layer.
 ///
 /// The layer's `Gradient` type is `()`, so both methods return the unit value.
 impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerSelf<DVector<F>> for NeuraSoftmaxLayer {
    /// Returns the (unit) regularization gradient.
    #[inline(always)]
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
    /// Returns the (unit) layer gradient.
    ///
    /// The arguments are underscore-prefixed because none of them is read,
    /// which avoids `unused_variables` warnings.
    #[inline(always)]
    fn get_gradient(
        &self,
        _input: &DVector<F>,
        _intermediary: &Self::IntermediaryRepr,
        _epsilon: &Self::Output,
    ) -> Self::Gradient {
        ()
    }
 }
 impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayerBackprop<DVector<F>>
    for NeuraSoftmaxLayer
 {
    fn backprop_layer(
        &self,
        input: &DVector<F>,
-        mut epsilon: Self::Output,
+        evaluated: &Self::IntermediaryRepr,
-    ) -> (DVector<F>, Self::Gradient) {
+        epsilon: &Self::Output,
-        // Note: a constant value can be added to `input` to bring it to increase precision
+    ) -> DVector<F> {
-        let evaluated = self.eval(input);
+        let mut epsilon = epsilon.clone();
-        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        // Compute $a_{l-1,i} ° \epsilon_{l,i}$
        hadamard_product(&mut epsilon, &evaluated);
        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
@ -80,15 +111,7 @@ impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for Neura
            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
        }
-        (epsilon, ())
+        epsilon
    }
    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }
    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
 }
@ -132,8 +155,9 @@ mod test {
                for epsilon1 in [1.7, 1.9, 2.3] {
                    for epsilon2 in [2.9, 3.1, 3.7] {
                        let epsilon = dvector![epsilon1, epsilon2];
                        let evaluated = layer.eval(&input);
-                        let (epsilon, _) = layer.backprop_layer(&input, epsilon);
+                        let epsilon = layer.backprop_layer(&input, &evaluated, &epsilon);
                        let expected = [
                            output[0] * (1.0 - output[0]) * epsilon1
                                - output[1] * output[0] * epsilon2,
@ -165,7 +189,8 @@ mod test {
            derivative += DMatrix::from_diagonal(&evaluated);
            let expected = derivative * &loss;
-            let (actual, _) = layer.backprop_layer(&input, loss);
+            let evaluated = layer.eval(&input);
            let actual = layer.backprop_layer(&input, &evaluated, &loss);
            for i in 0..4 {
                assert!((expected[i] - actual[i]).abs() < EPSILON);
--- a/src/network/mod.rs
+++ b/src/network/mod.rs
@ -4,6 +4,7 @@ use crate::{
 pub mod sequential;
 // TODO: extract regularize from this, so that we can drop the trait constraints on NeuraSequential's impl
 pub trait NeuraTrainableNetworkBase<Input>: NeuraLayer<Input> {
    type Gradient: NeuraVectorSpace;
    type LayerOutput;
--- a/src/network/sequential/layer_impl.rs
+++ b/src/network/sequential/layer_impl.rs
@ -0,0 +1,96 @@
 use super::*;
 use crate::prelude::NeuraTrainableLayerBackprop;
 /// Evaluation of a sequential network: the head `layer` runs first,
 /// and its output is handed over to `child_network`.
 impl<Input, Layer: NeuraLayer<Input>, ChildNetwork: NeuraLayer<Layer::Output>> NeuraLayer<Input>
    for NeuraSequential<Layer, ChildNetwork>
 {
    type Output = ChildNetwork::Output;
    /// Feeds `input` through `self.layer`, then through `self.child_network`,
    /// and returns the child network's output.
    fn eval(&self, input: &Input) -> Self::Output {
        let layer_output = self.layer.eval(input);
        self.child_network.eval(&layer_output)
    }
 }
 /// Lets a whole `NeuraSequential` be used as a single trainable layer,
 /// by pairing the head layer's data with the (boxed) data of the child network.
 impl<
        Input,
        Layer: NeuraTrainableLayerBase<Input>,
        ChildNetwork: NeuraTrainableLayerBase<Layer::Output>,
    > NeuraTrainableLayerBase<Input> for NeuraSequential<Layer, ChildNetwork>
 {
    /// Gradient of the head layer, followed by the boxed gradient of the child network.
    type Gradient = (Layer::Gradient, Box<ChildNetwork::Gradient>);
    /// Intermediary of the head layer, followed by the boxed intermediary of the child network.
    type IntermediaryRepr = (Layer::IntermediaryRepr, Box<ChildNetwork::IntermediaryRepr>);
    /// Builds the default gradient from the defaults of the head layer and of the child network.
    fn default_gradient(&self) -> Self::Gradient {
        let head_gradient = self.layer.default_gradient();
        let tail_gradient = self.child_network.default_gradient();
        (head_gradient, Box::new(tail_gradient))
    }
    /// Runs the training-mode forward pass through the head layer, then the child network,
    /// returning the final output together with both intermediaries.
    fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) {
        let (head_output, head_intermediary) = self.layer.eval_training(input);
        let (tail_output, tail_intermediary) = self.child_network.eval_training(&head_output);
        let intermediary = (head_intermediary, Box::new(tail_intermediary));
        (tail_output, intermediary)
    }
    /// Forwards `prepare_layer` to the head layer, then to the child network.
    fn prepare_layer(&mut self, is_training: bool) {
        self.layer.prepare_layer(is_training);
        self.child_network.prepare_layer(is_training);
    }
    /// Applies each half of `gradient` to the component it belongs to.
    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
        let (head_gradient, tail_gradient) = gradient;
        self.layer.apply_gradient(head_gradient);
        self.child_network.apply_gradient(tail_gradient);
    }
 }
 /// Self-gradient implementation for a `NeuraSequential` used as a layer.
 ///
 /// Only `regularize_layer` is functional; `get_gradient` is still a stub.
 impl<
        Input,
        Layer: NeuraTrainableLayerSelf<Input>,
        ChildNetwork: NeuraTrainableLayerSelf<Layer::Output> + NeuraTrainableLayerBackprop<Layer::Output>,
    > NeuraTrainableLayerSelf<Input> for NeuraSequential<Layer, ChildNetwork>
 {
    /// Pairs the regularization of the head layer with the boxed regularization of the child network.
    fn regularize_layer(&self) -> Self::Gradient {
        (
            self.layer.regularize_layer(),
            Box::new(self.child_network.regularize_layer()),
        )
    }
    /// Not implemented yet.
    ///
    /// # Panics
    ///
    /// Always panics through `unimplemented!`. The arguments are underscore-prefixed
    /// because they are not read, which avoids `unused_variables` warnings.
    fn get_gradient(
        &self,
        _input: &Input,
        _intermediary: &Self::IntermediaryRepr,
        _epsilon: &Self::Output,
    ) -> Self::Gradient {
        unimplemented!("NeuraSequential::get_gradient is not yet implemented, sorry");
    }
 }
 /// Backpropagation through a `NeuraSequential` used as a layer:
 /// the epsilon term goes through the child network first, then through the head layer.
 impl<
        Input,
        Layer: NeuraTrainableLayerBackprop<Input>,
        ChildNetwork: NeuraTrainableLayerBackprop<Layer::Output>,
    > NeuraTrainableLayerBackprop<Input> for NeuraSequential<Layer, ChildNetwork>
 {
    /// Computes the epsilon term to hand over to whatever precedes this network.
    ///
    /// - `input`: the value that was fed to the head layer
    /// - `intermediary`: pair of the head layer's intermediary (`.0`) and the child network's (`.1`)
    /// - `incoming_epsilon`: the epsilon term relative to the output of the child network
    fn backprop_layer(
        &self,
        input: &Input,
        intermediary: &Self::IntermediaryRepr,
        incoming_epsilon: &Self::Output,
    ) -> Input {
        // The child network needs the value it received as input, which `intermediary` does not carry,
        // so the head layer is evaluated again here.
        // NOTE(review): this uses `eval`, not `eval_training` — confirm both yield the same output
        // for every layer type.
        let transient_output = self.layer.eval(input);
        let transient_epsilon =
            self.child_network
                .backprop_layer(&transient_output, &intermediary.1, incoming_epsilon);
        self.layer
            .backprop_layer(input, &intermediary.0, &transient_epsilon)
    }
 }
--- a/src/network/sequential/mod.rs
+++ b/src/network/sequential/mod.rs
@ -1,10 +1,12 @@
 use super::{NeuraTrainableNetwork, NeuraTrainableNetworkBase};
 use crate::{
    gradient_solver::{NeuraGradientSolverFinal, NeuraGradientSolverTransient},
-    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer},
+    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayerBase},
    prelude::NeuraTrainableLayerSelf,
 };
 mod construct;
 mod layer_impl;
 mod tail;
 pub use construct::*;
@ -24,7 +26,7 @@ pub use tail::*;
 /// ## Notes on implemented traits
 ///
 /// The different implementations for `NeuraTrainableNetwork`,
-/// `NeuraLayer` and `NeuraTrainableLayer` each require that `ChildNetwork` implements those respective traits,
+/// `NeuraLayer` and `NeuraTrainableLayerBase` each require that `ChildNetwork` implements those respective traits,
 /// and that the output type of `Layer` matches the input type of `ChildNetwork`.
 ///
 /// If a method, like `eval`, is reported as missing,
@ -74,61 +76,9 @@ impl<Layer, ChildNetwork> NeuraSequential<Layer, ChildNetwork> {
    }
 }
 impl<Input, Layer: NeuraLayer<Input>, ChildNetwork: NeuraLayer<Layer::Output>> NeuraLayer<Input>
    for NeuraSequential<Layer, ChildNetwork>
 {
    type Output = ChildNetwork::Output;
    fn eval(&self, input: &Input) -> Self::Output {
        self.child_network.eval(&self.layer.eval(input))
    }
 }
 impl<
        Input,
        Layer: NeuraTrainableLayer<Input>,
        ChildNetwork: NeuraTrainableLayer<Layer::Output>,
    > NeuraTrainableLayer<Input> for NeuraSequential<Layer, ChildNetwork>
 {
    type Gradient = (Layer::Gradient, Box<ChildNetwork::Gradient>);
    fn default_gradient(&self) -> Self::Gradient {
        (
            self.layer.default_gradient(),
            Box::new(self.child_network.default_gradient()),
        )
    }
    fn backprop_layer(
        &self,
        input: &Input,
        incoming_epsilon: Self::Output,
    ) -> (Input, Self::Gradient) {
        let output = self.layer.eval(input);
        let (transient_epsilon, child_gradient) =
            self.child_network.backprop_layer(&output, incoming_epsilon);
        let (outgoing_epsilon, layer_gradient) =
            self.layer.backprop_layer(input, transient_epsilon);
        (outgoing_epsilon, (layer_gradient, Box::new(child_gradient)))
    }
    fn regularize_layer(&self) -> Self::Gradient {
        (
            self.layer.regularize_layer(),
            Box::new(self.child_network.regularize_layer()),
        )
    }
    fn apply_gradient(&mut self, gradient: &Self::Gradient) {
        self.layer.apply_gradient(&gradient.0);
        self.child_network.apply_gradient(&gradient.1);
    }
 }
 impl<
        Input,
-        Layer: NeuraTrainableLayer<Input>,
+        Layer: NeuraTrainableLayerBase<Input> + NeuraTrainableLayerSelf<Input>,
        ChildNetwork: NeuraTrainableNetworkBase<Layer::Output>,
    > NeuraTrainableNetworkBase<Input> for NeuraSequential<Layer, ChildNetwork>
 {
@ -188,8 +138,8 @@ impl<Input: Clone> NeuraTrainableNetworkBase<Input> for () {
 impl<
        Input,
-        Layer: NeuraTrainableLayer<Input>,
+        Layer: NeuraTrainableLayerBase<Input> + NeuraTrainableLayerSelf<Input>,
-        Optimizer: NeuraGradientSolverTransient<Layer::Output>,
+        Optimizer: NeuraGradientSolverTransient<Input, Layer>,
        ChildNetwork: NeuraTrainableNetworkBase<Layer::Output>,
    > NeuraTrainableNetwork<Input, Optimizer> for NeuraSequential<Layer, ChildNetwork>
 where
@ -200,12 +150,14 @@ where
        input: &Input,
        optimizer: &Optimizer,
    ) -> Optimizer::Output<Input, Self::Gradient> {
-        let next_activation = self.layer.eval(input);
+        let (next_activation, intermediary) = self.layer.eval_training(input);
        let child_result = self.child_network.traverse(&next_activation, optimizer);
        optimizer.eval_layer(
            &self.layer,
            input,
            &next_activation,
            &intermediary,
            child_result,
            |layer_gradient, child_gradient| (layer_gradient, Box::new(child_gradient)),
        )
--- a/src/train.rs
+++ b/src/train.rs
@ -82,7 +82,10 @@ impl NeuraBatchedTrainer {
        network: &mut Network,
        inputs: Inputs,
        test_inputs: &[(Input, Target)],
-    ) -> Vec<(f64, f64)> {
+    ) -> Vec<(f64, f64)>
    where
        <Network as NeuraTrainableNetworkBase<Input>>::Gradient: std::fmt::Debug,
    {
        let mut losses = Vec::new();
        let mut iter = inputs.into_iter();
        let factor = -self.learning_rate / (self.batch_size as f64);
--- a/tests/xor.rs
+++ b/tests/xor.rs
@ -1,12 +1,20 @@
 use std::fs::File;
 use approx::assert_relative_eq;
-use nalgebra::{DMatrix, DVector, dvector};
+use nalgebra::{dvector, DMatrix, DVector};
-use neuramethyst::{prelude::{*, dense::NeuraDenseLayer}, derivable::{activation::{Relu, Tanh}, regularize::NeuraL0, loss::Euclidean}};
+use neuramethyst::{
    derivable::{
        activation::{Relu, Tanh},
        loss::Euclidean,
        regularize::NeuraL0,
    },
    prelude::{dense::NeuraDenseLayer, *},
 };
 fn load_test_data() -> Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> {
    let file = File::open("tests/xor.json").unwrap();
-    let data: Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> = serde_json::from_reader(&file).unwrap();
+    let data: Vec<(DMatrix<f64>, DVector<f64>, DMatrix<f64>, DVector<f64>)> =
        serde_json::from_reader(&file).unwrap();
    data
 }
@ -43,7 +51,7 @@ fn test_xor_training() {
            network.layer.weights.clone(),
            network.layer.bias.clone(),
            network.child_network.layer.weights.clone(),
-            network.child_network.layer.bias.clone()
+            network.child_network.layer.bias.clone(),
        );
        assert_relative_eq!(expected.0.as_slice(), actual.0.as_slice());