From bca56a5557e7e2b9be2c2344508e29a6044c6751 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Sat, 15 Apr 2023 13:05:33 +0200 Subject: [PATCH] :sparkles: Re-order arguments of neura_layer, implement softmax and normalization --- examples/bivariate.rs | 27 ++++--- examples/xor.rs | 26 ++++-- src/derivable/activation.rs | 11 ++- src/derivable/mod.rs | 1 + src/derivable/regularize.rs | 134 +++++++++++++++++++++++++++++++ src/layer/dense.rs | 68 +++++++++++++--- src/layer/dropout.rs | 4 + src/layer/mod.rs | 35 ++++++-- src/layer/softmax.rs | 155 ++++++++++++++++++++++++++++++++++++ src/lib.rs | 10 +-- src/network.rs | 35 +++++--- src/train.rs | 49 +++++++----- src/utils.rs | 56 +++++++++++-- 13 files changed, 520 insertions(+), 91 deletions(-) create mode 100644 src/derivable/regularize.rs create mode 100644 src/layer/softmax.rs diff --git a/examples/bivariate.rs b/examples/bivariate.rs index 67443fe..5d64419 100644 --- a/examples/bivariate.rs +++ b/examples/bivariate.rs @@ -2,22 +2,26 @@ use std::io::Write; -use neuramethyst::prelude::*; -use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu}; +use neuramethyst::derivable::activation::Linear; +#[allow(unused_imports)] +use neuramethyst::derivable::activation::{LeakyRelu, Relu, Tanh}; use neuramethyst::derivable::loss::Euclidean; +use neuramethyst::derivable::regularize::NeuraElastic; +use neuramethyst::prelude::*; use rand::Rng; fn main() { let mut network = neura_network![ - neura_layer!("dense", LeakyRelu(0.01), 9, 2), + neura_layer!("dense", 2, 8; LeakyRelu(0.01)), neura_layer!("dropout", 0.1), - neura_layer!("dense", LeakyRelu(0.01), 9), + neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)), neura_layer!("dropout", 0.3), - neura_layer!("dense", LeakyRelu(0.01), 6), + neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)), neura_layer!("dropout", 0.1), - neura_layer!("dense", LeakyRelu(0.01), 4), - neura_layer!("dense", LeakyRelu(0.1), 2) + neura_layer!("dense", 4; LeakyRelu(0.1), NeuraElastic::new(0.0001, 0.002)), + neura_layer!("dense", 2; Linear), + neura_layer!("softmax"), ]; // println!("{:#?}", network); @@ -39,20 +43,23 @@ fn main() { let test_inputs: Vec<_> = inputs.clone().take(100).collect(); - let mut trainer = NeuraBatchedTrainer::new(0.1, 4000); - trainer.log_epochs = 500; + let mut trainer = NeuraBatchedTrainer::new(0.25, 1000); + trainer.log_epochs = 50; + trainer.learning_momentum = 0.05; + trainer.batch_size = 2000; trainer.train( NeuraBackprop::new(Euclidean), &mut network, inputs, - &test_inputs + &test_inputs, ); let mut file = std::fs::File::create("target/bivariate.csv").unwrap(); for (input, _target) in test_inputs { let guess = argmax(&network.eval(&input)); writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap(); + // println!("{:?}", network.eval(&input)); } // println!("{:#?}", network); diff --git a/examples/xor.rs b/examples/xor.rs index fa1a88b..be04629 100644 --- a/examples/xor.rs +++ b/examples/xor.rs @@ -1,25 +1,30 @@ #![feature(generic_arg_infer)] -use neuramethyst::prelude::*; -use neuramethyst::derivable::activation::{Relu}; +use neuramethyst::derivable::activation::Relu; use neuramethyst::derivable::loss::Euclidean; +use neuramethyst::prelude::*; fn main() { let mut network = neura_network![ - neura_layer!("dense", Relu, 4, 2), - neura_layer!("dense", Relu, 3), - neura_layer!("dense", Relu, 1) + neura_layer!("dense", 2, 4; Relu), + neura_layer!("dense", 3; Relu), + neura_layer!("dense", 1; Relu) ]; let inputs = [ ([0.0, 0.0], [0.0]), 
         ([0.0, 1.0], [1.0]),
         ([1.0, 0.0], [1.0]),
-        ([1.0, 1.0], [0.0])
+        ([1.0, 1.0], [0.0]),
     ];
 
     for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
     }
 
     let mut trainer = NeuraBatchedTrainer::new(0.05, 1000);
@@ -35,6 +40,11 @@ fn main() {
     );
 
     for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
     }
 }
diff --git a/src/derivable/activation.rs b/src/derivable/activation.rs
index 9c13bd3..07b97da 100644
--- a/src/derivable/activation.rs
+++ b/src/derivable/activation.rs
@@ -36,10 +36,9 @@ impl NeuraDerivable<f64> for Relu {
 }
 
 #[derive(Clone, Copy, Debug, PartialEq)]
-pub struct LeakyRelu(pub f64);
+pub struct LeakyRelu<F>(pub F);
 
-
-impl NeuraDerivable<f64> for LeakyRelu {
+impl NeuraDerivable<f64> for LeakyRelu<f64> {
     #[inline(always)]
     fn eval(&self, input: f64) -> f64 {
         if input > 0.0 {
@@ -59,13 +58,13 @@ impl NeuraDerivable<f64> for LeakyRelu {
     }
 }
 
-impl NeuraDerivable<f32> for LeakyRelu {
+impl NeuraDerivable<f32> for LeakyRelu<f32> {
     #[inline(always)]
     fn eval(&self, input: f32) -> f32 {
         if input > 0.0 {
             input
         } else {
-            (self.0 as f32) * input
+            self.0 * input
         }
     }
 
@@ -74,7 +73,7 @@ impl NeuraDerivable<f32> for LeakyRelu {
         if input > 0.0 {
             1.0
         } else {
-            self.0 as f32
+            self.0
         }
     }
 }
diff --git a/src/derivable/mod.rs b/src/derivable/mod.rs
index 9888423..b5c4412 100644
--- a/src/derivable/mod.rs
+++ b/src/derivable/mod.rs
@@ -1,5 +1,6 @@
 pub mod activation;
 pub mod loss;
+pub mod regularize;
 
 pub trait NeuraDerivable<F> {
     fn eval(&self, input: F) -> F;
diff --git a/src/derivable/regularize.rs b/src/derivable/regularize.rs
new file mode 100644
index 0000000..91cdb30
--- /dev/null
+++ b/src/derivable/regularize.rs
@@ -0,0 +1,134 @@
+use super::*;
+
+/// Default regularization, which is no regularization
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL0;
+
+impl NeuraDerivable<f64> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f64) -> f64 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f64) -> f64 {
+        0.0
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f32) -> f32 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f32) -> f32 {
+        0.0
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL1<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL1<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL1<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL2<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL2<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        self.0 * (input * input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        self.0 * at
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL2<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        self.0 * (input * input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        self.0 * at
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraElastic<F> {
+    pub l1: F,
+    pub l2: F,
+}
+
+impl<F> NeuraElastic<F> {
+    pub fn new(l1_factor: F, l2_factor: F) -> Self {
+        Self {
+            l1: l1_factor,
+            l2: l2_factor,
+        }
+    }
+}
+
+impl NeuraDerivable<f64> for NeuraElastic<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraElastic<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
diff --git a/src/layer/dense.rs b/src/layer/dense.rs
index bf94b76..2c9e5fc 100644
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@@ -1,39 +1,53 @@
 use super::NeuraLayer;
-use crate::{derivable::NeuraDerivable, utils::{multiply_matrix_vector, reverse_dot_product, multiply_matrix_transpose_vector}, train::NeuraTrainableLayer, algebra::NeuraVectorSpace};
+use crate::{
+    algebra::NeuraVectorSpace,
+    derivable::NeuraDerivable,
+    train::NeuraTrainableLayer,
+    utils::{multiply_matrix_transpose_vector, multiply_matrix_vector, reverse_dot_product},
+};
 
-use rand_distr::Distribution;
 use rand::Rng;
+use rand_distr::Distribution;
 
 #[derive(Clone, Debug)]
 pub struct NeuraDenseLayer<
     Act: NeuraDerivable<f64>,
+    Reg: NeuraDerivable<f64>,
     const INPUT_LEN: usize,
     const OUTPUT_LEN: usize,
 > {
     weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
     bias: [f64; OUTPUT_LEN],
     activation: Act,
+    regularization: Reg,
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
-    NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     pub fn new(
         weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
         bias: [f64; OUTPUT_LEN],
         activation: Act,
+        regularization: Reg,
     ) -> Self {
         Self {
             weights,
             bias,
             activation,
+            regularization,
         }
     }
 
-    pub fn from_rng(rng: &mut impl Rng, activation: Act) -> Self {
+    pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
         let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN];
-        let distribution = rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
+        let distribution =
+            rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
 
         for i in 0..OUTPUT_LEN {
             for j in 0..INPUT_LEN {
@@ -46,12 +60,17 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
             // Biases are zero-initialized, as this shouldn't cause any issues during training
             bias: [0.0; OUTPUT_LEN],
             activation,
+            regularization,
         }
     }
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     type Input = [f64; INPUT_LEN];
 
@@ -68,13 +87,21 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraLayer
     }
 }
 
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraTrainableLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
     type Delta = ([[f64; INPUT_LEN]; OUTPUT_LEN], [f64; OUTPUT_LEN]);
 
     // TODO: double-check the math in this
-    fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) {
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
         let evaluated = multiply_matrix_vector(&self.weights, input);
         // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta`
         let mut delta = epsilon.clone();
@@ -96,17 +123,32 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraTrainableLayer
         NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
         NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
     }
+
+    fn regularize(&self) -> Self::Delta {
+        let mut res = ([[0.0; INPUT_LEN]; OUTPUT_LEN], [0.0; OUTPUT_LEN]);
+
+        for i in 0..OUTPUT_LEN {
+            for j in 0..INPUT_LEN {
+                res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
+            }
+        }
+
+        // Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
+
+        res
+    }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
-    use crate::derivable::activation::Relu;
+    use crate::derivable::{activation::Relu, regularize::NeuraL0};
 
     #[test]
     fn test_from_rng() {
         let mut rng = rand::thread_rng();
-        let layer: NeuraDenseLayer<_, 64, 32> = NeuraDenseLayer::from_rng(&mut rng, Relu);
+        let layer: NeuraDenseLayer<_, _, 64, 32> =
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
         let mut input = [0.0; 64];
         for x in 0..64 {
             input[x] = rng.gen();
diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs
index d8ff615..5ce6479 100644
--- a/src/layer/dropout.rs
+++ b/src/layer/dropout.rs
@@ -59,6 +59,10 @@ impl<R: Rng, const LENGTH: usize> NeuraTrainableLayer for NeuraDropoutLayer<R,
         (epsilon, ())
     }
 
+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
     #[inline(always)]
     fn apply_gradient(&mut self, _gradient: &Self::Delta) {
         // Noop
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index a74bd69..4ac7393 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -4,6 +4,9 @@ pub use dense::NeuraDenseLayer;
 mod dropout;
 pub use dropout::NeuraDropoutLayer;
 
+mod softmax;
+pub use softmax::NeuraSoftmaxLayer;
+
 pub trait NeuraLayer {
     type Input;
     type Output;
@@ -13,18 +16,34 @@ pub trait NeuraLayer {
 
 #[macro_export]
 macro_rules! neura_layer {
-    ( "dense", $activation:expr, $output:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, _, $output>
+    ( "dense", $( $shape:expr ),*; $activation:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "_dense_shape", $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, _, $output>
     };
 
-    ( "dense", $activation:expr, $output:expr, $input:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, $input, $output>
+    ( "_dense_shape", $input:expr, $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, $input, $output>
     };
 
     ( "dropout", $probability:expr ) => {
-        NeuraDropoutLayer::new($probability, rand::thread_rng())
-            as NeuraDropoutLayer<_, _>
+        $crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
+            as $crate::layer::NeuraDropoutLayer<_, _>
+    };
+
+    ( "softmax" ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
+    };
+
+    ( "softmax", $length:expr ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
     };
 }
diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs
new file mode 100644
index 0000000..8160e50
--- /dev/null
+++ b/src/layer/softmax.rs
@@ -0,0 +1,155 @@
+use crate::{train::NeuraTrainableLayer, utils::multiply_vectors_pointwise};
+
+use super::NeuraLayer;
+
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub struct NeuraSoftmaxLayer<const LENGTH: usize>;
+
+impl<const LENGTH: usize> NeuraSoftmaxLayer<LENGTH> {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl<const LENGTH: usize> NeuraLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Input = [f64; LENGTH];
+    type Output = [f64; LENGTH];
+
+    fn eval(&self, input: &Self::Input) -> Self::Output {
+        let mut res = input.clone();
+
+        let mut max = 0.0;
+        for item in &res {
+            if *item > max {
+                max = *item;
+            }
+        }
+
+        for item in &mut res {
+            *item = (*item - max).exp();
+        }
+
+        let mut sum = 0.0;
+        for item in &res {
+            sum += item;
+        }
+
+        for item in &mut res {
+            *item /= sum;
+        }
+
+        res
+    }
+}
+
+impl<const LENGTH: usize> NeuraTrainableLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Delta = ();
+
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        mut epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
+        // Note: a constant value can be added to `input` to increase numerical precision
+        let evaluated = self.eval(input);
+
+        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        epsilon = multiply_vectors_pointwise(&epsilon, &evaluated);
+
+        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
+        let sum_diagonal_terms: f64 = epsilon.iter().copied().sum();
+
+        for i in 0..LENGTH {
+            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and subtract it from $a_{l-1,i} \epsilon_{l,i}$
+            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
+        }
+
+        (epsilon, ())
+    }
+
+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Delta) {
+        // Noop
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::algebra::NeuraVectorSpace;
+    use crate::utils::{
+        matrix_from_diagonal, multiply_matrix_vector, reverse_dot_product, uniform_vector,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_softmax_eval() {
+        const EPSILON: f64 = 0.000002;
+        let layer = NeuraSoftmaxLayer::new() as 
NeuraSoftmaxLayer<3>; + + let result = layer.eval(&[1.0, 2.0, 8.0]); + + assert!((result[0] - 0.0009088).abs() < EPSILON); + assert!((result[1] - 0.0024704).abs() < EPSILON); + assert!((result[2] - 0.9966208).abs() < EPSILON); + } + + // Based on https://stats.stackexchange.com/a/306710 + #[test] + fn test_softmax_backpropagation_two() { + const EPSILON: f64 = 0.000001; + let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<2>; + + for input1 in [0.2, 0.3, 0.5] as [f64; 3] { + for input2 in [0.7, 1.1, 1.3] { + let input = [input1, input2]; + let sum = input1.exp() + input2.exp(); + let output = [input1.exp() / sum, input2.exp() / sum]; + for epsilon1 in [1.7, 1.9, 2.3] { + for epsilon2 in [2.9, 3.1, 3.7] { + let epsilon = [epsilon1, epsilon2]; + + let (epsilon, _) = layer.backpropagate(&input, epsilon); + let expected = [ + output[0] * (1.0 - output[0]) * epsilon1 + - output[1] * output[0] * epsilon2, + output[1] * (1.0 - output[1]) * epsilon2 + - output[1] * output[0] * epsilon1, + ]; + + assert!((epsilon[0] - expected[0]).abs() < EPSILON); + assert!((epsilon[1] - expected[1]).abs() < EPSILON); + } + } + } + } + } + + // Based on https://e2eml.school/softmax.html + #[test] + fn test_softmax_backpropagation() { + const EPSILON: f64 = 0.000001; + let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<4>; + + for _ in 0..100 { + let input: [f64; 4] = uniform_vector(); + let evaluated = layer.eval(&input); + let loss: [f64; 4] = uniform_vector(); + + let mut derivative = reverse_dot_product(&evaluated, &evaluated); + derivative.mul_assign(-1.0); + derivative.add_assign(&matrix_from_diagonal(&evaluated)); + + let expected = multiply_matrix_vector(&derivative, &loss); + let (actual, _) = layer.backpropagate(&input, loss); + + for i in 0..4 { + assert!((expected[i] - actual[i]).abs() < EPSILON); + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 0c66809..4bfe1e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,15 +10,11 @@ mod utils; pub mod prelude { // Macros - pub use crate::{neura_network, neura_layer}; + pub use crate::{neura_layer, neura_network}; // Structs and traits - pub use crate::network::{NeuraNetwork}; - pub use crate::layer::{ - NeuraLayer, - NeuraDenseLayer, - NeuraDropoutLayer - }; + pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer}; + pub use crate::network::NeuraNetwork; pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer}; pub use crate::utils::cycle_shuffling; } diff --git a/src/network.rs b/src/network.rs index f165e3e..3a6f4dc 100644 --- a/src/network.rs +++ b/src/network.rs @@ -82,6 +82,10 @@ impl NeuraTrainable for NeuraNetwork { self.layer.backpropagate(&input, backprop_epsilon) } + fn regularize(&self) -> Self::Delta { + self.layer.regularize() + } + fn prepare_epoch(&mut self) { self.layer.prepare_epoch(); } @@ -117,6 +121,10 @@ impl Self::Delta { + (self.layer.regularize(), self.child_network.regularize()) + } + fn prepare_epoch(&mut self) { self.layer.prepare_epoch(); self.child_network.prepare_epoch(); @@ -145,7 +153,11 @@ macro_rules! 
neura_network { #[cfg(test)] mod test { - use crate::{derivable::activation::Relu, layer::NeuraDenseLayer, neura_layer}; + use crate::{ + derivable::{activation::Relu, regularize::NeuraL0}, + layer::NeuraDenseLayer, + neura_layer, + }; use super::*; @@ -154,23 +166,24 @@ mod test { let mut rng = rand::thread_rng(); let _ = neura_network![ - NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>, - NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>, - NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 2> + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>, + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 2> ]; - let _ = - neura_network![NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,]; + let _ = neura_network![ + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, + ]; let _ = neura_network![ - NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>, - NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>, + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>, + NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>, ]; let _ = neura_network![ - neura_layer!("dense", Relu, 16, 8), - neura_layer!("dense", Relu, 12), - neura_layer!("dense", Relu, 2) + neura_layer!("dense", 8, 16; Relu), + neura_layer!("dense", 12; Relu), + neura_layer!("dense", 2; Relu) ]; } } diff --git a/src/train.rs b/src/train.rs index b2e71c9..c7627fe 100644 --- a/src/train.rs +++ b/src/train.rs @@ -1,8 +1,5 @@ use crate::{ - algebra::NeuraVectorSpace, - derivable::NeuraLoss, - layer::NeuraLayer, - network::NeuraNetwork, + algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer, network::NeuraNetwork, }; // TODO: move this to layer/mod.rs @@ -26,6 +23,9 @@ pub trait NeuraTrainableLayer: NeuraLayer { epsilon: Self::Output, ) -> (Self::Input, Self::Delta); + /// Computes the regularization + fn regularize(&self) -> Self::Delta; + /// Applies `δW_l` to the weights of the layer fn apply_gradient(&mut self, gradient: &Self::Delta); @@ -51,6 +51,9 @@ pub trait NeuraTrainable: NeuraLayer { loss: Loss, ) -> (Self::Input, Self::Delta); + /// Should return the regularization gradient + fn regularize(&self) -> Self::Delta; + /// Called before an epoch begins, to allow the network to set itself up for training. fn prepare_epoch(&mut self); @@ -89,8 +92,8 @@ impl NeuraBackprop { } } -impl + Clone> NeuraGradientSolver<[f64; N], Loss::Target> - for NeuraBackprop +impl + Clone> + NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop { fn get_gradient( &self, @@ -184,15 +187,17 @@ impl NeuraBatchedTrainer { NeuraNetwork: NeuraTrainable, Layer::Input: Clone, { - // TODO: apply shuffling? 
         let mut iter = inputs.into_iter();
         let factor = -self.learning_rate / (self.batch_size as f64);
         let momentum_factor = self.learning_momentum / self.learning_rate;
+        let reg_factor = -self.learning_rate;
 
         // Contains `momentum_factor * factor * gradient_sum_previous_iter`
-        let mut previous_gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+        let mut previous_gradient_sum =
+            <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
 
         'd: for epoch in 0..self.epochs {
-            let mut gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+            let mut gradient_sum =
+                <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
             network.prepare_epoch();
 
             for _ in 0..self.batch_size {
@@ -205,6 +210,12 @@ impl NeuraBatchedTrainer {
             }
 
             gradient_sum.mul_assign(factor);
+
+            // Add regularization gradient (TODO: check if it can be factored out of momentum)
+            let mut reg_gradient = network.regularize();
+            reg_gradient.mul_assign(reg_factor);
+            gradient_sum.add_assign(&reg_gradient);
+
             network.apply_gradient(&gradient_sum);
 
             if self.learning_momentum != 0.0 {
@@ -230,23 +241,21 @@ impl NeuraBatchedTrainer {
 
 #[cfg(test)]
 mod test {
-    use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}};
     use super::*;
+    use crate::{
+        derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
+        layer::NeuraDenseLayer,
+    };
 
     #[test]
     fn test_backpropagation_simple() {
         for wa in [0.0, 0.25, 0.5, 1.0] {
             for wb in [0.0, 0.25, 0.5, 1.0] {
-                let network = NeuraNetwork::new(
-                    NeuraDenseLayer::new([[wa, wb]], [0.0], Linear),
-                    ()
-                );
-
-                let gradient = NeuraBackprop::new(Euclidean).get_gradient(
-                    &network,
-                    &[1.0, 1.0],
-                    &[0.0]
-                );
+                let network =
+                    NeuraNetwork::new(NeuraDenseLayer::new([[wa, wb]], [0.0], Linear, NeuraL0), ());
+
+                let gradient =
+                    NeuraBackprop::new(Euclidean).get_gradient(&network, &[1.0, 1.0], &[0.0]);
 
                 let expected = wa + wb;
                 assert!((gradient.0[0][0] - expected) < 0.001);
diff --git a/src/utils.rs b/src/utils.rs
index c6db3a4..081312b 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -33,6 +33,7 @@ pub(crate) fn multiply_matrix_transpose_vector(
     left: &[f64; HEIGHT],
     right: &[f64; WIDTH],
@@ -48,6 +49,32 @@ pub(crate) fn reverse_dot_product<const WIDTH: usize, const HEIGHT: usize>(
     result
 }
 
+pub(crate) fn multiply_vectors_pointwise<const LENGTH: usize>(
+    left: &[f64; LENGTH],
+    right: &[f64; LENGTH],
+) -> [f64; LENGTH] {
+    let mut result = [0.0; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i] = left[i] * right[i];
+    }
+
+    result
+}
+
+#[cfg(test)]
+pub(crate) fn matrix_from_diagonal<const LENGTH: usize>(
+    vector: &[f64; LENGTH],
+) -> [[f64; LENGTH]; LENGTH] {
+    let mut result = [[0.0; LENGTH]; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i][i] = vector[i];
+    }
+
+    result
+}
+
 #[allow(dead_code)]
 pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f64; N]) {
     for i in 0..N {
@@ -89,7 +116,10 @@ struct ShuffleCycled<I: Iterator, R: rand::Rng> {
     rng: R,
 }
 
-impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item: Clone {
+impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R>
+where
+    I::Item: Clone,
+{
     type Item = I::Item;
 
     #[inline]
@@ -99,7 +129,7 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
         if let Some(next) = self.iter.next() {
             // Base iterator is not empty yet
             self.buffer.push(next.clone());
-            return Some(next)
+            return Some(next);
         } else if self.buffer.len() > 0 {
             if self.index == 0 {
                 // Shuffle the vector and return the first element, setting the index to 1
@@ -118,12 +148,9 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
     }
 }
 
-pub fn cycle_shuffling<I: Iterator>(
-    iter: I,
-    rng: impl rand::Rng
-) -> impl Iterator<Item = I::Item>
+pub fn cycle_shuffling<I: Iterator>(iter: I, rng: impl rand::Rng) -> impl Iterator<Item = I::Item>
 where
-    I::Item: Clone
+    I::Item: Clone,
 {
     let size_hint = iter.size_hint();
     let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
@@ -132,6 +159,19 @@ where
         buffer: Vec::with_capacity(size_hint),
         index: 0,
         iter,
-        rng
+        rng,
     }
 }
+
+#[cfg(test)]
+pub(crate) fn uniform_vector<const LENGTH: usize>() -> [f64; LENGTH] {
+    use rand::Rng;
+    let mut res = [0.0; LENGTH];
+    let mut rng = rand::thread_rng();
+
+    for i in 0..LENGTH {
+        res[i] = rng.gen();
+    }
+
+    res
+}
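
A quick usage note on the re-ordered neura_layer! arguments: the layer shape now comes first and the activation (plus an optional regularization object) follows after a semicolon, as used in examples/bivariate.rs. The snippet below is only an illustration of the new call shape; it assumes the same nightly feature setup as the existing examples and is not part of the patch.

    #![feature(generic_arg_infer)]
    // Old call shape: neura_layer!("dense", <activation>, <output size>, <input size>)
    // New call shape: neura_layer!("dense", <input size>, <output size>; <activation>[, <regularization>])
    use neuramethyst::derivable::activation::LeakyRelu;
    use neuramethyst::derivable::regularize::NeuraElastic;
    use neuramethyst::prelude::*;

    fn main() {
        let network = neura_network![
            neura_layer!("dense", 2, 8; LeakyRelu(0.01)),
            neura_layer!("dense", 4; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
            neura_layer!("softmax"),
        ];
        // The input size of the later layers and of the softmax layer is inferred.
        let _ = network.eval(&[0.5, -0.5]);
    }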
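
For reference, the penalties in src/derivable/regularize.rs evaluate to l1 * |w| for NeuraL1 and factor * w^2 for NeuraL2, whose derivate returns factor * w (the factor of 2 from d(w^2)/dw is folded into the chosen factor); NeuraElastic is their sum. A minimal standalone sketch of that math, independent of the crate's traits (function names here are illustrative only):

    // Elastic-net penalty as combined by NeuraElastic:
    // eval(w)     = l1 * |w| + l2 * w^2
    // derivate(w) = l1 * sign(w) + l2 * w   (matching NeuraL2's `self.0 * at`)
    fn elastic_eval(l1: f64, l2: f64, w: f64) -> f64 {
        l1 * w.abs() + l2 * (w * w)
    }

    fn elastic_derivate(l1: f64, l2: f64, w: f64) -> f64 {
        let sign = if w > 0.0 { 1.0 } else if w < 0.0 { -1.0 } else { 0.0 };
        l1 * sign + l2 * w
    }

    fn main() {
        // The trainer subtracts learning_rate * derivate(w) from each weight,
        // pulling small weights toward zero (L1) and shrinking large ones (L2).
        let (l1, l2) = (0.0001, 0.002);
        for w in [-1.5, -0.01, 0.0, 0.2, 3.0] {
            println!(
                "w = {w:+.2}  penalty = {:.6}  gradient = {:+.6}",
                elastic_eval(l1, l2, w),
                elastic_derivate(l1, l2, w)
            );
        }
    }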
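
The backward pass in src/layer/softmax.rs applies the softmax Jacobian without materializing the full matrix: with s = softmax(x), the epsilon passed back for x_i is s_i * eps_i - s_i * sum_k(s_k * eps_k), i.e. (diag(s) - s s^T) applied to eps. A small standalone sketch of the same computation, not using the crate's types:

    // Softmax of a slice, with the usual max-subtraction for numerical stability.
    fn softmax(input: &[f64]) -> Vec<f64> {
        let max = input.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let exps: Vec<f64> = input.iter().map(|x| (x - max).exp()).collect();
        let sum: f64 = exps.iter().sum();
        exps.into_iter().map(|x| x / sum).collect()
    }

    // Backpropagation through softmax: eps_i <- s_i * eps_i - s_i * sum_k(s_k * eps_k),
    // which is the same update NeuraSoftmaxLayer::backpropagate performs in place.
    fn softmax_backprop(input: &[f64], epsilon: &[f64]) -> Vec<f64> {
        let s = softmax(input);
        let weighted: Vec<f64> = s.iter().zip(epsilon).map(|(si, ei)| si * ei).collect();
        let sum_diagonal_terms: f64 = weighted.iter().sum();
        weighted
            .iter()
            .zip(&s)
            .map(|(wi, si)| wi - si * sum_diagonal_terms)
            .collect()
    }

    fn main() {
        let input = [1.0, 2.0, 8.0];
        let epsilon = [0.3, -0.1, 0.5];
        println!("{:?}", softmax(&input)); // ~[0.0009, 0.0025, 0.9966], as in test_softmax_eval
        println!("{:?}", softmax_backprop(&input, &epsilon));
    }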
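
Finally, the per-batch update in NeuraBatchedTrainer::train sums the loss gradients of the batch, scales them by -learning_rate / batch_size, and adds the regularization gradient scaled by the full -learning_rate (it is not divided by the batch size). The scalar sketch below shows only that shape of the update; the momentum term (learning_momentum / learning_rate times the previous scaled gradient) is left out, and the numbers are hypothetical:

    // One weight, one batch: average loss gradient plus regularization gradient.
    fn batch_update(
        weight: &mut f64,
        batch_loss_gradients: &[f64],
        reg_gradient: f64,
        learning_rate: f64,
    ) {
        let batch_size = batch_loss_gradients.len() as f64;
        let factor = -learning_rate / batch_size;
        let reg_factor = -learning_rate;

        let mut update = factor * batch_loss_gradients.iter().sum::<f64>();
        update += reg_factor * reg_gradient;
        *weight += update;
    }

    fn main() {
        let mut w = 0.5;
        batch_update(&mut w, &[0.2, -0.1, 0.4, 0.3], 0.001, 0.25);
        println!("updated weight: {w}");
    }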