From d40098d2efba6f6a634722ba1a8bfa4412e46e70 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Sat, 22 Apr 2023 10:31:46 +0200 Subject: [PATCH] :fire: Refactor of NeuraTrainableLayer, split it into multiple traits --- examples/generate-tests.rs | 4 +- src/gradient_solver/backprop.rs | 60 ++++++++++------ src/gradient_solver/forward_forward.rs | 23 +++--- src/gradient_solver/mod.rs | 17 +++-- src/layer/dense.rs | 77 +++++++++++++++------ src/layer/dropout.rs | 51 ++++++++++---- src/layer/mod.rs | 90 +++++++++++++++++------- src/layer/normalize.rs | 59 +++++++++++----- src/layer/softmax.rs | 59 +++++++++++----- src/network/mod.rs | 1 + src/network/sequential/layer_impl.rs | 96 ++++++++++++++++++++++++++ src/network/sequential/mod.rs | 68 +++--------------- src/train.rs | 5 +- tests/xor.rs | 16 +++-- 14 files changed, 425 insertions(+), 201 deletions(-) create mode 100644 src/network/sequential/layer_impl.rs diff --git a/examples/generate-tests.rs b/examples/generate-tests.rs index 1b33f3b..c35e334 100644 --- a/examples/generate-tests.rs +++ b/examples/generate-tests.rs @@ -30,7 +30,7 @@ fn main() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), )]; for iteration in 0..4 { @@ -45,7 +45,7 @@ fn main() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), )); } diff --git a/src/gradient_solver/backprop.rs b/src/gradient_solver/backprop.rs index 637253b..d6635ec 100644 --- a/src/gradient_solver/backprop.rs +++ b/src/gradient_solver/backprop.rs @@ -1,6 +1,9 @@ use num::ToPrimitive; -use crate::{derivable::NeuraLoss, layer::NeuraTrainableLayer, network::NeuraTrainableNetworkBase}; +use crate::{ + derivable::NeuraLoss, layer::NeuraTrainableLayerBackprop, layer::NeuraTrainableLayerSelf, + network::NeuraTrainableNetworkBase, +}; use super::*; @@ -53,23 +56,26 @@ impl> } } -impl NeuraGradientSolverTransient - for (&NeuraBackprop, &Target) -{ - fn eval_layer< +impl< Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer, - >( + Target, + Loss, + Layer: NeuraTrainableLayerBackprop + NeuraTrainableLayerSelf, + > NeuraGradientSolverTransient for (&NeuraBackprop, &Target) +{ + fn eval_layer( &self, layer: &Layer, input: &Input, - rec_opt_output: Self::Output, + _output: &Layer::Output, + intermediary: &Layer::IntermediaryRepr, + rec_opt_output: Self::Output, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output { let (epsilon_in, rec_gradient) = rec_opt_output; - let (epsilon_out, layer_gradient) = layer.backprop_layer(input, epsilon_in); + + let epsilon_out = layer.backprop_layer(input, intermediary, &epsilon_in); + let layer_gradient = layer.get_gradient(input, intermediary, &epsilon_in); (epsilon_out, combine_gradients(layer_gradient, rec_gradient)) } @@ -80,7 +86,11 @@ mod test { use approx::assert_relative_eq; use super::*; - use crate::{prelude::*, derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, utils::uniform_vector}; + use crate::{ + derivable::{activation::Tanh, loss::Euclidean, NeuraDerivable}, + prelude::*, + utils::uniform_vector, + }; #[test] fn test_backprop_epsilon_bias() { @@ -91,16 +101,22 @@ mod test { let network = neura_sequential![ neura_layer!("dense", 4, f64).activation(Tanh), neura_layer!("dense", 2, f64).activation(Tanh) - 
].construct(NeuraShape::Vector(4)).unwrap(); + ] + .construct(NeuraShape::Vector(4)) + .unwrap(); let optimizer = NeuraBackprop::new(Euclidean); let input = uniform_vector(4); let target = uniform_vector(2); let layer1_intermediary = &network.layer.weights * &input; - let layer2_intermediary = &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh()); + let layer2_intermediary = + &network.child_network.layer.weights * layer1_intermediary.map(|x| x.tanh()); - assert_relative_eq!(layer1_intermediary.map(|x| x.tanh()), network.clone().trim_tail().eval(&input)); + assert_relative_eq!( + layer1_intermediary.map(|x| x.tanh()), + network.clone().trim_tail().eval(&input) + ); let output = network.eval(&input); @@ -110,25 +126,27 @@ mod test { for i in 0..2 { delta2_expected[i] *= Tanh.derivate(layer2_intermediary[i]); } - let delta2_actual = gradient.1.0.1; + let delta2_actual = gradient.1 .0 .1; assert_relative_eq!(delta2_actual.as_slice(), delta2_expected.as_slice()); - let gradient2_expected = &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose(); - let gradient2_actual = gradient.1.0.0; + let gradient2_expected = + &delta2_expected * layer1_intermediary.map(|x| x.tanh()).transpose(); + let gradient2_actual = gradient.1 .0 .0; assert_relative_eq!(gradient2_actual.as_slice(), gradient2_expected.as_slice()); - let mut delta1_expected = network.child_network.layer.weights.transpose() * delta2_expected; + let mut delta1_expected = + network.child_network.layer.weights.transpose() * delta2_expected; for i in 0..4 { delta1_expected[i] *= Tanh.derivate(layer1_intermediary[i]); } - let delta1_actual = gradient.0.1; + let delta1_actual = gradient.0 .1; assert_relative_eq!(delta1_actual.as_slice(), delta1_expected.as_slice()); let gradient1_expected = &delta1_expected * input.transpose(); - let gradient1_actual = gradient.0.0; + let gradient1_actual = gradient.0 .0; assert_relative_eq!(gradient1_actual.as_slice(), gradient1_expected.as_slice()); } diff --git a/src/gradient_solver/forward_forward.rs b/src/gradient_solver/forward_forward.rs index 76c8b70..2b88a00 100644 --- a/src/gradient_solver/forward_forward.rs +++ b/src/gradient_solver/forward_forward.rs @@ -1,7 +1,7 @@ use nalgebra::{DVector, Scalar}; use num::{traits::NumAssignOps, Float, ToPrimitive}; -use crate::derivable::NeuraDerivable; +use crate::{derivable::NeuraDerivable, prelude::NeuraTrainableLayerSelf}; use super::*; @@ -90,22 +90,23 @@ impl NeuraGradientSolverFinal for NeuraForwardPai } } -impl> - NeuraGradientSolverTransient> for NeuraForwardPair -{ - fn eval_layer< +impl< + F: Float + Scalar + NumAssignOps, + Act: NeuraDerivable, Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer>, - >( + Layer: NeuraTrainableLayerSelf>, + > NeuraGradientSolverTransient for NeuraForwardPair +{ + fn eval_layer( &self, layer: &Layer, input: &Input, + output: &Layer::Output, + intermediary: &Layer::IntermediaryRepr, rec_gradient: RecGradient, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output { - let output = layer.eval(input); + // let output = layer.eval(input); let goodness = output .iter() .copied() @@ -129,7 +130,7 @@ impl> } // TODO: split backprop_layer into eval_training, get_gradient and get_backprop - let (_, layer_gradient) = layer.backprop_layer(input, goodness_derivative); + let layer_gradient = layer.get_gradient(input, intermediary, &goodness_derivative); combine_gradients(layer_gradient, rec_gradient) } diff --git a/src/gradient_solver/mod.rs 
b/src/gradient_solver/mod.rs index 3f68076..732ca2c 100644 --- a/src/gradient_solver/mod.rs +++ b/src/gradient_solver/mod.rs @@ -5,7 +5,7 @@ mod forward_forward; pub use forward_forward::NeuraForwardForward; use crate::{ - layer::NeuraTrainableLayer, + layer::NeuraTrainableLayerBase, network::{NeuraTrainableNetwork, NeuraTrainableNetworkBase}, }; @@ -17,17 +17,16 @@ pub trait NeuraGradientSolverFinal: NeuraGradientSolverBase { fn eval_final(&self, output: LayerOutput) -> Self::Output; } -pub trait NeuraGradientSolverTransient: NeuraGradientSolverBase { - fn eval_layer< - Input, - NetworkGradient, - RecGradient, - Layer: NeuraTrainableLayer, - >( +pub trait NeuraGradientSolverTransient>: + NeuraGradientSolverBase +{ + fn eval_layer( &self, layer: &Layer, input: &Input, - rec_opt_output: Self::Output, + output: &Layer::Output, + layer_intermediary: &Layer::IntermediaryRepr, + rec_opt_output: Self::Output, combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient, ) -> Self::Output; } diff --git a/src/layer/dense.rs b/src/layer/dense.rs index 2437f4d..0df187a 100644 --- a/src/layer/dense.rs +++ b/src/layer/dense.rs @@ -161,9 +161,9 @@ impl< fn eval(&self, input: &DVector) -> Self::Output { assert_eq!(input.shape().0, self.weights.shape().1); - let res = &self.weights * input + &self.bias; + let evaluated = &self.weights * input + &self.bias; - res.map(|x| self.activation.eval(x)) + evaluated.map(|x| self.activation.eval(x)) } } @@ -171,9 +171,17 @@ impl< F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, Act: NeuraDerivable, Reg: NeuraDerivable, - > NeuraTrainableLayer> for NeuraDenseLayer + > NeuraTrainableLayerBase> for NeuraDenseLayer { type Gradient = (DMatrix, DVector); + type IntermediaryRepr = DVector; // pre-activation values + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + let evaluated = &self.weights * input + &self.bias; + let output = evaluated.map(|x| self.activation.eval(x)); + + (output, evaluated) + } fn default_gradient(&self) -> Self::Gradient { ( @@ -182,41 +190,70 @@ impl< ) } - fn backprop_layer( + fn apply_gradient(&mut self, gradient: &Self::Gradient) { + self.weights += &gradient.0; + self.bias += &gradient.1; + } +} + +impl< + F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, + Act: NeuraDerivable, + Reg: NeuraDerivable, + > NeuraTrainableLayerSelf> for NeuraDenseLayer +{ + fn regularize_layer(&self) -> Self::Gradient { + ( + self.weights.map(|x| self.regularization.derivate(x)), + DVector::zeros(self.bias.shape().0), + ) + } + + fn get_gradient( &self, input: &DVector, - epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - let evaluated = &self.weights * input + &self.bias; + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), // with `self.activation'(input) ° epsilon = delta` let mut delta = epsilon.clone(); for i in 0..delta.len() { + // TODO: remove `- self.bias[i]` delta[i] *= self.activation.derivate(evaluated[i]); } - // Compute the weight gradient let weights_gradient = &delta * input.transpose(); - let new_epsilon = self.weights.tr_mul(&delta); - // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation // The gradient of the bias is equal to the delta term of the backpropagation algorithm let bias_gradient = delta; - (new_epsilon, 
(weights_gradient, bias_gradient)) + (weights_gradient, bias_gradient) } +} - fn regularize_layer(&self) -> Self::Gradient { - ( - self.weights.map(|x| self.regularization.derivate(x)), - DVector::zeros(self.bias.shape().0), - ) - } +impl< + F: Float + std::fmt::Debug + 'static + std::ops::AddAssign + std::ops::MulAssign, + Act: NeuraDerivable, + Reg: NeuraDerivable, + > NeuraTrainableLayerBackprop> for NeuraDenseLayer +{ + fn backprop_layer( + &self, + input: &DVector, + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), + // with `self.activation'(input) ° epsilon = delta` + let mut delta = epsilon.clone(); - fn apply_gradient(&mut self, gradient: &Self::Gradient) { - self.weights += &gradient.0; - self.bias += &gradient.1; + for i in 0..delta.len() { + delta[i] *= self.activation.derivate(evaluated[i]); + } + + self.weights.tr_mul(&delta) } } diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs index afd0511..b44fb32 100644 --- a/src/layer/dropout.rs +++ b/src/layer/dropout.rs @@ -61,24 +61,15 @@ impl NeuraLayer> for NeuraDropoutLayer { } } -impl NeuraTrainableLayer> for NeuraDropoutLayer { +impl NeuraTrainableLayerBase> for NeuraDropoutLayer { type Gradient = (); + type IntermediaryRepr = (); - fn default_gradient(&self) -> Self::Gradient { - () + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + (self.eval(input), ()) } - fn backprop_layer( - &self, - _input: &DVector, - mut epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - self.apply_dropout(&mut epsilon); - - (epsilon, ()) - } - - fn regularize_layer(&self) -> Self::Gradient { + fn default_gradient(&self) -> Self::Gradient { () } @@ -110,6 +101,36 @@ impl NeuraTrainableLayer> for NeuraDropoutLayer } } +impl NeuraTrainableLayerSelf> for NeuraDropoutLayer { + fn regularize_layer(&self) -> Self::Gradient { + () + } + + fn get_gradient( + &self, + _input: &DVector, + _intermediary: &Self::IntermediaryRepr, + _epsilon: &Self::Output, + ) -> Self::Gradient { + () + } +} + +impl NeuraTrainableLayerBackprop> for NeuraDropoutLayer { + fn backprop_layer( + &self, + _input: &DVector, + _intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon = epsilon.clone(); + + self.apply_dropout(&mut epsilon); + + epsilon + } +} + #[cfg(test)] mod test { use super::*; @@ -121,7 +142,7 @@ mod test { .unwrap(); for _ in 0..100 { - as NeuraTrainableLayer>>::prepare_layer( + as NeuraTrainableLayerBase>>::prepare_layer( &mut layer, true, ); assert!(layer.multiplier.is_finite()); diff --git a/src/layer/mod.rs b/src/layer/mod.rs index 10cc623..6a80e40 100644 --- a/src/layer/mod.rs +++ b/src/layer/mod.rs @@ -23,6 +23,7 @@ impl NeuraShape { } pub trait NeuraLayer { + /// What type the layer outputs type Output; fn eval(&self, input: &Input) -> Self::Output; @@ -46,12 +47,64 @@ pub trait NeuraPartialLayer { fn output_shape(constructed: &Self::Constructed) -> NeuraShape; } -pub trait NeuraTrainableLayer: NeuraLayer { +pub trait NeuraTrainableLayerBase: NeuraLayer { /// The representation of the layer gradient as a vector space type Gradient: NeuraVectorSpace; + /// An intermediary object type to be passed to the various training methods + type IntermediaryRepr; + fn default_gradient(&self) -> Self::Gradient; + /// Applies `δW_l` to the weights of the layer + fn apply_gradient(&mut self, gradient: &Self::Gradient); + + fn eval_training(&self, 
input: &Input) -> (Self::Output, Self::IntermediaryRepr); + + /// Arbitrary computation that can be executed at the start of an epoch + #[allow(unused_variables)] + #[inline(always)] + fn prepare_layer(&mut self, is_training: bool) {} +} + +/// Contains methods relative to a layer's ability to compute its own weights gradients, +/// given the derivative of the output variables. +pub trait NeuraTrainableLayerSelf: NeuraTrainableLayerBase { + /// Computes the regularization + fn regularize_layer(&self) -> Self::Gradient; + + /// Computes the layer's gradient, + /// + /// `intermediary` is guaranteed to have been generated by a previous call to `eval_training`, + /// without mutation of `self` in-between, and with the same `input`. + fn get_gradient( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient; +} + +// impl> NeuraTrainableLayerSelf +// for Layer +// { +// #[inline(always)] +// fn regularize_layer(&self) -> Self::Gradient { +// () +// } + +// #[inline(always)] +// fn get_gradient( +// &self, +// input: &Input, +// intermediary: &Self::IntermediaryRepr, +// epsilon: Self::Output, +// ) -> Self::Gradient { +// () +// } +// } + +pub trait NeuraTrainableLayerBackprop: NeuraTrainableLayerBase { /// Computes the backpropagation term and the derivative of the internal weights, /// using the `input` vector outputted by the previous layer and the backpropagation term `epsilon` of the next layer. /// @@ -63,42 +116,31 @@ pub trait NeuraTrainableLayer: NeuraLayer { /// The function should then return a pair `(epsilon_{l-1}, δW_l)`, /// with `epsilon_{l-1}` being multiplied by `f_{l-1}'(activation)` by the next layer to obtain `delta_{l-1}`. /// Using this intermediate value for `delta` allows us to isolate it computation to the respective layers. 
- fn backprop_layer(&self, input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient); - - /// Computes the regularization - fn regularize_layer(&self) -> Self::Gradient; - - /// Applies `δW_l` to the weights of the layer - fn apply_gradient(&mut self, gradient: &Self::Gradient); - - /// Arbitrary computation that can be executed at the start of an epoch - #[allow(unused_variables)] - #[inline(always)] - fn prepare_layer(&mut self, is_training: bool) {} + fn backprop_layer( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Input; } -impl NeuraTrainableLayer for () { +impl NeuraTrainableLayerBase for () { type Gradient = (); + type IntermediaryRepr = (); #[inline(always)] fn default_gradient(&self) -> Self::Gradient { () } - #[inline(always)] - fn backprop_layer(&self, _input: &Input, epsilon: Self::Output) -> (Input, Self::Gradient) { - (epsilon, ()) - } - - #[inline(always)] - fn regularize_layer(&self) -> Self::Gradient { - () - } - #[inline(always)] fn apply_gradient(&mut self, _gradient: &Self::Gradient) { // Noop } + + fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) { + (self.eval(input), ()) + } } /// Temporary implementation of neura_layer diff --git a/src/layer/normalize.rs b/src/layer/normalize.rs index 291657e..2cb81a1 100644 --- a/src/layer/normalize.rs +++ b/src/layer/normalize.rs @@ -1,4 +1,4 @@ -use nalgebra::{DVector, Scalar}; +use nalgebra::{DMatrix, DVector, Scalar}; use num::{traits::NumAssignOps, Float}; use super::*; @@ -54,14 +54,19 @@ impl NeuraLayer> for NeuraNormalizeLayer { } } -impl NeuraTrainableLayer> for NeuraNormalizeLayer { +impl NeuraTrainableLayerBase> for NeuraNormalizeLayer { type Gradient = (); + type IntermediaryRepr = (DMatrix, F); // Partial jacobian matrix (without the kroenecker term) and stddev - fn backprop_layer( - &self, - input: &DVector, - epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { + fn default_gradient(&self) -> Self::Gradient { + () + } + + fn apply_gradient(&mut self, _gradient: &Self::Gradient) { + // Noop + } + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { let (mean, variance, len) = mean_variance(input); let stddev = F::sqrt(variance); let input_centered = input.clone().map(|x| x - mean); @@ -73,26 +78,42 @@ impl NeuraTrainableLayer> for Neura *value += F::one() / (stddev * len); } - let mut epsilon_out = jacobian_partial * ε - - // Apply the δ_{ik}/σ term - for i in 0..epsilon_out.len() { - epsilon_out[i] += epsilon[i] / stddev; - } - - (epsilon_out, ()) + (input_centered / stddev, (jacobian_partial, stddev)) } +} - fn default_gradient(&self) -> Self::Gradient { +impl NeuraTrainableLayerSelf> for NeuraNormalizeLayer { + fn regularize_layer(&self) -> Self::Gradient { () } - fn regularize_layer(&self) -> Self::Gradient { + fn get_gradient( + &self, + input: &DVector, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { () } +} - fn apply_gradient(&mut self, _gradient: &Self::Gradient) { - // Noop +impl NeuraTrainableLayerBackprop> + for NeuraNormalizeLayer +{ + fn backprop_layer( + &self, + input: &DVector, + (jacobian_partial, stddev): &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon_out = jacobian_partial * epsilon; + + // Apply the δ_{ik}/σ term + for i in 0..epsilon_out.len() { + epsilon_out[i] += epsilon[i] / *stddev; + } + + epsilon_out } } diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs index 
e428677..6f97472 100644 --- a/src/layer/softmax.rs +++ b/src/layer/softmax.rs @@ -54,22 +54,53 @@ impl NeuraPartialLayer for NeuraSoftmaxLayer { } } -impl NeuraTrainableLayer> for NeuraSoftmaxLayer { +impl NeuraTrainableLayerBase> for NeuraSoftmaxLayer { type Gradient = (); + type IntermediaryRepr = Self::Output; // Result of self.eval fn default_gradient(&self) -> Self::Gradient { () } + fn apply_gradient(&mut self, _gradient: &Self::Gradient) { + // Noop + } + + fn eval_training(&self, input: &DVector) -> (Self::Output, Self::IntermediaryRepr) { + let res = self.eval(input); + (res.clone(), res) + } +} + +impl NeuraTrainableLayerSelf> for NeuraSoftmaxLayer { + #[inline(always)] + fn regularize_layer(&self) -> Self::Gradient { + () + } + + #[inline(always)] + fn get_gradient( + &self, + input: &DVector, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { + () + } +} + +impl NeuraTrainableLayerBackprop> + for NeuraSoftmaxLayer +{ fn backprop_layer( &self, input: &DVector, - mut epsilon: Self::Output, - ) -> (DVector, Self::Gradient) { - // Note: a constant value can be added to `input` to bring it to increase precision - let evaluated = self.eval(input); + evaluated: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> DVector { + let mut epsilon = epsilon.clone(); - // Compute $a_{l-1,i} \epsilon_{l,i}$ + // Compute $a_{l-1,i} ° \epsilon_{l,i}$ hadamard_product(&mut epsilon, &evaluated); // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ @@ -80,15 +111,7 @@ impl NeuraTrainableLayer> for Neura epsilon[i] -= evaluated[i] * sum_diagonal_terms; } - (epsilon, ()) - } - - fn regularize_layer(&self) -> Self::Gradient { - () - } - - fn apply_gradient(&mut self, _gradient: &Self::Gradient) { - // Noop + epsilon } } @@ -132,8 +155,9 @@ mod test { for epsilon1 in [1.7, 1.9, 2.3] { for epsilon2 in [2.9, 3.1, 3.7] { let epsilon = dvector![epsilon1, epsilon2]; + let evaluated = layer.eval(&input); - let (epsilon, _) = layer.backprop_layer(&input, epsilon); + let epsilon = layer.backprop_layer(&input, &evaluated, &epsilon); let expected = [ output[0] * (1.0 - output[0]) * epsilon1 - output[1] * output[0] * epsilon2, @@ -165,7 +189,8 @@ mod test { derivative += DMatrix::from_diagonal(&evaluated); let expected = derivative * &loss; - let (actual, _) = layer.backprop_layer(&input, loss); + let evaluated = layer.eval(&input); + let actual = layer.backprop_layer(&input, &evaluated, &loss); for i in 0..4 { assert!((expected[i] - actual[i]).abs() < EPSILON); diff --git a/src/network/mod.rs b/src/network/mod.rs index 823ee61..335889e 100644 --- a/src/network/mod.rs +++ b/src/network/mod.rs @@ -4,6 +4,7 @@ use crate::{ pub mod sequential; +// TODO: extract regularize from this, so that we can drop the trait constraints on NeuraSequential's impl pub trait NeuraTrainableNetworkBase: NeuraLayer { type Gradient: NeuraVectorSpace; type LayerOutput; diff --git a/src/network/sequential/layer_impl.rs b/src/network/sequential/layer_impl.rs new file mode 100644 index 0000000..e454b84 --- /dev/null +++ b/src/network/sequential/layer_impl.rs @@ -0,0 +1,96 @@ +use super::*; +use crate::prelude::NeuraTrainableLayerBackprop; + +impl, ChildNetwork: NeuraLayer> NeuraLayer + for NeuraSequential +{ + type Output = ChildNetwork::Output; + + fn eval(&self, input: &Input) -> Self::Output { + self.child_network.eval(&self.layer.eval(input)) + } +} + +impl< + Input, + Layer: NeuraTrainableLayerBase, + ChildNetwork: NeuraTrainableLayerBase, + > NeuraTrainableLayerBase for NeuraSequential +{ + 
type Gradient = (Layer::Gradient, Box); + type IntermediaryRepr = (Layer::IntermediaryRepr, Box); + + fn default_gradient(&self) -> Self::Gradient { + ( + self.layer.default_gradient(), + Box::new(self.child_network.default_gradient()), + ) + } + + fn eval_training(&self, input: &Input) -> (Self::Output, Self::IntermediaryRepr) { + let (layer_output, layer_intermediary) = self.layer.eval_training(input); + let (child_output, child_intermediary) = self.child_network.eval_training(&layer_output); + + ( + child_output, + (layer_intermediary, Box::new(child_intermediary)), + ) + } + + fn prepare_layer(&mut self, is_training: bool) { + self.layer.prepare_layer(is_training); + self.child_network.prepare_layer(is_training); + } + + fn apply_gradient(&mut self, gradient: &Self::Gradient) { + self.layer.apply_gradient(&gradient.0); + self.child_network.apply_gradient(&gradient.1); + } +} + +impl< + Input, + Layer: NeuraTrainableLayerSelf, + ChildNetwork: NeuraTrainableLayerSelf + NeuraTrainableLayerBackprop, + > NeuraTrainableLayerSelf for NeuraSequential +{ + fn regularize_layer(&self) -> Self::Gradient { + ( + self.layer.regularize_layer(), + Box::new(self.child_network.regularize_layer()), + ) + } + + fn get_gradient( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + epsilon: &Self::Output, + ) -> Self::Gradient { + unimplemented!("NeuraSequential::get_gradient is not yet implemented, sorry"); + } +} + +impl< + Input, + Layer: NeuraTrainableLayerBackprop, + ChildNetwork: NeuraTrainableLayerBackprop, + > NeuraTrainableLayerBackprop for NeuraSequential +{ + fn backprop_layer( + &self, + input: &Input, + intermediary: &Self::IntermediaryRepr, + incoming_epsilon: &Self::Output, + ) -> Input { + let transient_output = self.layer.eval(input); + let transient_epsilon = + self.child_network + .backprop_layer(&transient_output, &intermediary.1, incoming_epsilon); + let outgoing_epsilon = + self.layer + .backprop_layer(input, &intermediary.0, &transient_epsilon); + + outgoing_epsilon + } +} diff --git a/src/network/sequential/mod.rs b/src/network/sequential/mod.rs index a969d33..dfeb577 100644 --- a/src/network/sequential/mod.rs +++ b/src/network/sequential/mod.rs @@ -1,10 +1,12 @@ use super::{NeuraTrainableNetwork, NeuraTrainableNetworkBase}; use crate::{ gradient_solver::{NeuraGradientSolverFinal, NeuraGradientSolverTransient}, - layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer}, + layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayerBase}, + prelude::NeuraTrainableLayerSelf, }; mod construct; +mod layer_impl; mod tail; pub use construct::*; @@ -24,7 +26,7 @@ pub use tail::*; /// ## Notes on implemented traits /// /// The different implementations for `NeuraTrainableNetwork`, -/// `NeuraLayer` and `NeuraTrainableLayer` each require that `ChildNetwork` implements those respective traits, +/// `NeuraLayer` and `NeuraTrainableLayerBase` each require that `ChildNetwork` implements those respective traits, /// and that the output type of `Layer` matches the input type of `ChildNetwork`. 
/// /// If a method, like `eval`, is reported as missing, @@ -74,61 +76,9 @@ impl NeuraSequential { } } -impl, ChildNetwork: NeuraLayer> NeuraLayer - for NeuraSequential -{ - type Output = ChildNetwork::Output; - - fn eval(&self, input: &Input) -> Self::Output { - self.child_network.eval(&self.layer.eval(input)) - } -} - -impl< - Input, - Layer: NeuraTrainableLayer, - ChildNetwork: NeuraTrainableLayer, - > NeuraTrainableLayer for NeuraSequential -{ - type Gradient = (Layer::Gradient, Box); - - fn default_gradient(&self) -> Self::Gradient { - ( - self.layer.default_gradient(), - Box::new(self.child_network.default_gradient()), - ) - } - - fn backprop_layer( - &self, - input: &Input, - incoming_epsilon: Self::Output, - ) -> (Input, Self::Gradient) { - let output = self.layer.eval(input); - let (transient_epsilon, child_gradient) = - self.child_network.backprop_layer(&output, incoming_epsilon); - let (outgoing_epsilon, layer_gradient) = - self.layer.backprop_layer(input, transient_epsilon); - - (outgoing_epsilon, (layer_gradient, Box::new(child_gradient))) - } - - fn regularize_layer(&self) -> Self::Gradient { - ( - self.layer.regularize_layer(), - Box::new(self.child_network.regularize_layer()), - ) - } - - fn apply_gradient(&mut self, gradient: &Self::Gradient) { - self.layer.apply_gradient(&gradient.0); - self.child_network.apply_gradient(&gradient.1); - } -} - impl< Input, - Layer: NeuraTrainableLayer, + Layer: NeuraTrainableLayerBase + NeuraTrainableLayerSelf, ChildNetwork: NeuraTrainableNetworkBase, > NeuraTrainableNetworkBase for NeuraSequential { @@ -188,8 +138,8 @@ impl NeuraTrainableNetworkBase for () { impl< Input, - Layer: NeuraTrainableLayer, - Optimizer: NeuraGradientSolverTransient, + Layer: NeuraTrainableLayerBase + NeuraTrainableLayerSelf, + Optimizer: NeuraGradientSolverTransient, ChildNetwork: NeuraTrainableNetworkBase, > NeuraTrainableNetwork for NeuraSequential where @@ -200,12 +150,14 @@ where input: &Input, optimizer: &Optimizer, ) -> Optimizer::Output { - let next_activation = self.layer.eval(input); + let (next_activation, intermediary) = self.layer.eval_training(input); let child_result = self.child_network.traverse(&next_activation, optimizer); optimizer.eval_layer( &self.layer, input, + &next_activation, + &intermediary, child_result, |layer_gradient, child_gradient| (layer_gradient, Box::new(child_gradient)), ) diff --git a/src/train.rs b/src/train.rs index 576acca..24e1ce4 100644 --- a/src/train.rs +++ b/src/train.rs @@ -82,7 +82,10 @@ impl NeuraBatchedTrainer { network: &mut Network, inputs: Inputs, test_inputs: &[(Input, Target)], - ) -> Vec<(f64, f64)> { + ) -> Vec<(f64, f64)> + where + >::Gradient: std::fmt::Debug, + { let mut losses = Vec::new(); let mut iter = inputs.into_iter(); let factor = -self.learning_rate / (self.batch_size as f64); diff --git a/tests/xor.rs b/tests/xor.rs index 9f47459..1b9edb0 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -1,12 +1,20 @@ use std::fs::File; use approx::assert_relative_eq; -use nalgebra::{DMatrix, DVector, dvector}; -use neuramethyst::{prelude::{*, dense::NeuraDenseLayer}, derivable::{activation::{Relu, Tanh}, regularize::NeuraL0, loss::Euclidean}}; +use nalgebra::{dvector, DMatrix, DVector}; +use neuramethyst::{ + derivable::{ + activation::{Relu, Tanh}, + loss::Euclidean, + regularize::NeuraL0, + }, + prelude::{dense::NeuraDenseLayer, *}, +}; fn load_test_data() -> Vec<(DMatrix, DVector, DMatrix, DVector)> { let file = File::open("tests/xor.json").unwrap(); - let data: Vec<(DMatrix, DVector, DMatrix, DVector)> = 
serde_json::from_reader(&file).unwrap(); + let data: Vec<(DMatrix, DVector, DMatrix, DVector)> = + serde_json::from_reader(&file).unwrap(); data } @@ -43,7 +51,7 @@ fn test_xor_training() { network.layer.weights.clone(), network.layer.bias.clone(), network.child_network.layer.weights.clone(), - network.child_network.layer.bias.clone() + network.child_network.layer.bias.clone(), ); assert_relative_eq!(expected.0.as_slice(), actual.0.as_slice());
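
Note (illustrative, not part of the patch): this refactor splits the old `NeuraTrainableLayer` trait into `NeuraTrainableLayerBase` (forward pass with an `IntermediaryRepr`, plus gradient application), `NeuraTrainableLayerSelf` (regularization and weight gradients computed from that cached intermediary), and `NeuraTrainableLayerBackprop` (input-gradient computation). The sketch below shows how the three traits compose for a single layer during one training step, assuming the trait signatures introduced in `src/layer/mod.rs` and the prelude re-exports used elsewhere in the patch; `train_step` is a hypothetical helper, not crate API.

// Illustrative sketch only — not part of the patch.
// Assumes the traits are reachable through the prelude, as in the patched
// `layer_impl.rs` and `sequential/mod.rs`; `train_step` is hypothetical.
use neuramethyst::prelude::*;

fn train_step<Input, Layer>(
    layer: &mut Layer,
    input: &Input,
    epsilon: &Layer::Output, // gradient of the loss w.r.t. this layer's output
) -> Input
where
    Layer: NeuraTrainableLayerSelf<Input> + NeuraTrainableLayerBackprop<Input>,
{
    // Forward pass that also returns the layer's IntermediaryRepr
    // (for a dense layer, the pre-activation values).
    let (_output, intermediary) = layer.eval_training(input);

    // Weight gradient, computed from the cached intermediary instead of
    // re-evaluating the layer as the old backprop_layer did.
    let gradient = layer.get_gradient(input, &intermediary, epsilon);

    // Gradient with respect to the input, handed on to the previous layer.
    let epsilon_out = layer.backprop_layer(input, &intermediary, epsilon);

    // In the real trainer the gradient would first be scaled by
    // -learning_rate / batch_size (see src/train.rs) before being applied.
    layer.apply_gradient(&gradient);

    epsilon_out
}

One consequence of the split, visible in the patch itself: the forward-forward solver only bounds its layers on `NeuraTrainableLayerSelf`, so layers no longer need to implement backpropagation to be trainable with it, while the backprop solver bounds on both `NeuraTrainableLayerSelf` and `NeuraTrainableLayerBackprop`.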