From 8ac82e20e29ba8b7d95f1e09b097e60b2eec8ea3 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Wed, 12 Apr 2023 17:16:04 +0200 Subject: [PATCH] :sparkles: Working backpropagation :3 --- examples/bivariate.rs | 71 +++++++++++++++ examples/xor.rs | 28 +++--- src/derivable/activation.rs | 71 +++++++++++++++ src/layer/dense.rs | 6 +- src/lib.rs | 7 +- src/train.rs | 174 +++++++++++++++++++++++++++--------- src/utils.rs | 93 +++++++++++++++---- 7 files changed, 367 insertions(+), 83 deletions(-) create mode 100644 examples/bivariate.rs diff --git a/examples/bivariate.rs b/examples/bivariate.rs new file mode 100644 index 0000000..4a7c0b0 --- /dev/null +++ b/examples/bivariate.rs @@ -0,0 +1,71 @@ +#![feature(generic_arg_infer)] + +use std::io::Write; + +use neuramethyst::prelude::*; +use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu}; +use neuramethyst::derivable::loss::Euclidean; + +use rand::Rng; + +fn main() { + let mut network = neura_network![ + neura_layer!("dense", LeakyRelu(0.01), 4, 2), + neura_layer!("dense", Tanh, 3), + neura_layer!("dense", Relu, 2) + ]; + + let mut rng = rand::thread_rng(); + let inputs = (0..=1).cycle().map(move |category| { + let (x, y) = if category == 0 { + let radius: f64 = rng.gen_range(0.0..1.0); + let radius = radius.sqrt(); + let angle = rng.gen_range(0.0..std::f64::consts::TAU); + (angle.cos() * radius, angle.sin() * radius) + } else { + let radius: f64 = rng.gen_range(1.0..2.0); + let angle = rng.gen_range(0.0..std::f64::consts::TAU); + (angle.cos() * radius, angle.sin() * radius) + }; + + ([x, y], one_hot::<2>(category)) + }); + + let test_inputs: Vec<_> = inputs.clone().take(100).collect(); + + let mut trainer = NeuraBatchedTrainer::new(0.1, 4000); + trainer.log_epochs = 500; + + trainer.train( + NeuraBackprop::new(Euclidean), + &mut network, + inputs, + &test_inputs + ); + + let mut file = std::fs::File::create("target/bivariate.csv").unwrap(); + for (input, _target) in test_inputs { + let guess = argmax(&network.eval(&input)); + writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap(); + } +} + +fn one_hot(value: usize) -> [f64; N] { + let mut res = [0.0; N]; + if value < N { + res[value] = 1.0; + } + res +} + +fn argmax(array: &[f64]) -> usize { + let mut res = 0; + + for n in 1..array.len() { + if array[n] > array[res] { + res = n; + } + } + + res +} diff --git a/examples/xor.rs b/examples/xor.rs index 9d19aa0..fa1a88b 100644 --- a/examples/xor.rs +++ b/examples/xor.rs @@ -1,13 +1,13 @@ #![feature(generic_arg_infer)] use neuramethyst::prelude::*; -use neuramethyst::derivable::activation::{Relu, Tanh}; +use neuramethyst::derivable::activation::{Relu}; use neuramethyst::derivable::loss::Euclidean; fn main() { let mut network = neura_network![ - neura_layer!("dense", Tanh, 2, 2), - neura_layer!("dense", Tanh, 3), + neura_layer!("dense", Relu, 4, 2), + neura_layer!("dense", Relu, 3), neura_layer!("dense", Relu, 1) ]; @@ -18,25 +18,23 @@ fn main() { ([1.0, 1.0], [0.0]) ]; - // println!("{:#?}", network); - for (input, target) in inputs { - println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]); + println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]); } - train_batched( + let mut trainer = NeuraBatchedTrainer::new(0.05, 1000); + trainer.batch_size = 6; + trainer.log_epochs = 250; + trainer.learning_momentum = 0.01; + + trainer.train( + NeuraBackprop::new(Euclidean), &mut network, - inputs.clone(), + cycle_shuffling(inputs.iter().cloned(), 
rand::thread_rng()), &inputs, - NeuraBackprop::new(Euclidean), - 0.01, - 1, - 25 ); - // println!("{:#?}", network); - for (input, target) in inputs { - println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]); + println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]); } } diff --git a/src/derivable/activation.rs b/src/derivable/activation.rs index 0bac5ee..9c13bd3 100644 --- a/src/derivable/activation.rs +++ b/src/derivable/activation.rs @@ -35,6 +35,50 @@ impl NeuraDerivable for Relu { } } +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct LeakyRelu(pub f64); + + +impl NeuraDerivable for LeakyRelu { + #[inline(always)] + fn eval(&self, input: f64) -> f64 { + if input > 0.0 { + input + } else { + self.0 * input + } + } + + #[inline(always)] + fn derivate(&self, input: f64) -> f64 { + if input > 0.0 { + 1.0 + } else { + self.0 + } + } +} + +impl NeuraDerivable for LeakyRelu { + #[inline(always)] + fn eval(&self, input: f32) -> f32 { + if input > 0.0 { + input + } else { + (self.0 as f32) * input + } + } + + #[inline(always)] + fn derivate(&self, input: f32) -> f32 { + if input > 0.0 { + 1.0 + } else { + self.0 as f32 + } + } +} + #[derive(Clone, Copy, Debug, PartialEq)] pub struct Tanh; @@ -63,3 +107,30 @@ impl NeuraDerivable for Tanh { 0.5 * (1.0 - tanh * tanh) } } + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Linear; + +impl NeuraDerivable for Linear { + #[inline(always)] + fn eval(&self, input: f64) -> f64 { + input + } + + #[inline(always)] + fn derivate(&self, _at: f64) -> f64 { + 1.0 + } +} + +impl NeuraDerivable for Linear { + #[inline(always)] + fn eval(&self, input: f32) -> f32 { + input + } + + #[inline(always)] + fn derivate(&self, _at: f32) -> f32 { + 1.0 + } +} diff --git a/src/layer/dense.rs b/src/layer/dense.rs index 2929f22..1776bc8 100644 --- a/src/layer/dense.rs +++ b/src/layer/dense.rs @@ -35,7 +35,7 @@ impl, const INPUT_LEN: usize, const OUTPUT_LEN: usize> for i in 0..OUTPUT_LEN { for j in 0..INPUT_LEN { - weights[i][j] = rng.gen_range(-multiplier..multiplier); + weights[i][j] = rng.gen_range(0.0..multiplier); } } @@ -74,10 +74,10 @@ impl, const INPUT_LEN: usize, const OUTPUT_LEN: usize> // TODO: double-check the math in this fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) { let evaluated = multiply_matrix_vector(&self.weights, input); - // Compute delta from epsilon, with `self.activation'(z) * epsilon = delta` + // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta` let mut delta = epsilon.clone(); for i in 0..OUTPUT_LEN { - delta[i] = self.activation.derivate(evaluated[i]); + delta[i] *= self.activation.derivate(evaluated[i]); } let weights_gradient = reverse_dot_product(&delta, input); diff --git a/src/lib.rs b/src/lib.rs index d17f734..eb9b2e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,8 @@ pub mod prelude { pub use crate::{neura_network, neura_layer}; // Structs and traits - pub use super::network::{NeuraNetwork}; - pub use super::layer::{NeuraLayer, NeuraDenseLayer}; - pub use super::train::{NeuraBackprop, train_batched}; + pub use crate::network::{NeuraNetwork}; + pub use crate::layer::{NeuraLayer, NeuraDenseLayer}; + pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer}; + pub use crate::utils::cycle_shuffling; } diff --git a/src/train.rs b/src/train.rs index adc23fb..13074b4 100644 --- a/src/train.rs +++ b/src/train.rs @@ -3,7 +3,7 @@ use crate::{ algebra::NeuraVectorSpace, 
derivable::NeuraLoss, layer::NeuraLayer, - network::NeuraNetwork, + network::NeuraNetwork, utils::cycle_shuffling, }; pub trait NeuraTrainableLayer: NeuraLayer { @@ -44,7 +44,7 @@ pub trait NeuraTrainable: NeuraLayer { ) -> (Self::Input, Self::Delta); } -pub trait NeuraTrainer { +pub trait NeuraGradientSolver { fn get_gradient( &self, trainable: &NeuraNetwork, @@ -75,7 +75,7 @@ impl NeuraBackprop { } } -impl + Clone> NeuraTrainer<[f64; N], Loss::Target> +impl + Clone> NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop { fn get_gradient( @@ -103,49 +103,137 @@ impl + Clone> NeuraTrainer<[f6 } } -pub fn train_batched< - Output, - Target, - Trainer: NeuraTrainer, - Layer: NeuraLayer, - ChildNetwork, - Inputs: IntoIterator, ->( - network: &mut NeuraNetwork, - inputs: Inputs, - test_inputs: &[(Layer::Input, Target)], - trainer: Trainer, - learning_rate: f64, - batch_size: usize, - epochs: usize, -) where - NeuraNetwork: NeuraTrainable, - Inputs::IntoIter: Clone, -{ - // TODO: apply shuffling? - let mut iter = inputs.into_iter().cycle(); - let factor = -learning_rate / (batch_size as f64); - - 'd: for epoch in 0..epochs { - let mut gradient_sum = as NeuraTrainable>::Delta::zero(); - - for _ in 0..batch_size { - if let Some((input, target)) = iter.next() { - let gradient = trainer.get_gradient(&network, &input, &target); - gradient_sum.add_assign(&gradient); - } else { - break 'd; - } +#[non_exhaustive] +pub struct NeuraBatchedTrainer { + /// The learning rate of the gradient descent algorithm; the weights `W` will be updated as follows: + /// `W += -learning_rate * gradient_average`. + /// + /// Defaults to `0.1` + pub learning_rate: f64, + + /// The momentum of the gradient descent algorithm; if set to a non-zero value, then the weights `W` will be updated as follows: + /// `W += -learning_rate * gradient_average - learning_momentum * previous_gradient`. + /// This value should be smaller than `learning_rate`. + /// + /// Defaults to `0.0` + pub learning_momentum: f64, + + /// How many gradient computations to average before updating the weights + pub batch_size: usize, + + /// How many batches to run for; if `epochs * batch_size` exceeds the input length, then training will stop. + /// You should use `cycle_shuffling` from the `prelude` module to avoid this. + pub epochs: usize, + + /// The trainer will log progress at every multiple of `log_epochs` steps. + /// If `log_epochs` is zero (default), then no progress will be logged. + /// + /// The test inputs is used to measure the score of the network. + pub log_epochs: usize, +} + +impl Default for NeuraBatchedTrainer { + fn default() -> Self { + Self { + learning_rate: 0.1, + learning_momentum: 0.0, + batch_size: 100, + epochs: 100, + log_epochs: 0, } + } +} - gradient_sum.mul_assign(factor); - network.apply_gradient(&gradient_sum); +impl NeuraBatchedTrainer { + pub fn new(learning_rate: f64, epochs: usize) -> Self { + Self { + learning_rate, + epochs, + ..Default::default() + } + } + + pub fn train< + Output, + Target: Clone, + GradientSolver: NeuraGradientSolver, + Layer: NeuraLayer, + ChildNetwork, + Inputs: IntoIterator, + >( + &self, + gradient_solver: GradientSolver, + network: &mut NeuraNetwork, + inputs: Inputs, + test_inputs: &[(Layer::Input, Target)], + ) where + NeuraNetwork: NeuraTrainable, + Layer::Input: Clone, + { + // TODO: apply shuffling? 
+ let mut iter = inputs.into_iter(); + let factor = -self.learning_rate / (self.batch_size as f64); + let momentum_factor = self.learning_momentum / self.learning_rate; + + // Contains `momentum_factor * factor * gradient_sum_previous_iter` + let mut previous_gradient_sum = as NeuraTrainable>::Delta::zero(); + 'd: for epoch in 0..self.epochs { + let mut gradient_sum = as NeuraTrainable>::Delta::zero(); + + for _ in 0..self.batch_size { + if let Some((input, target)) = iter.next() { + let gradient = gradient_solver.get_gradient(&network, &input, &target); + gradient_sum.add_assign(&gradient); + } else { + break 'd; + } + } + + gradient_sum.mul_assign(factor); + network.apply_gradient(&gradient_sum); + + if self.learning_momentum != 0.0 { + network.apply_gradient(&previous_gradient_sum); + previous_gradient_sum = gradient_sum; + previous_gradient_sum.mul_assign(momentum_factor); + } - let mut loss_sum = 0.0; - for (input, target) in test_inputs { - loss_sum += trainer.score(&network, input, target); + if self.log_epochs > 0 && (epoch + 1) % self.log_epochs == 0 { + let mut loss_sum = 0.0; + for (input, target) in test_inputs { + loss_sum += gradient_solver.score(&network, input, target); + } + loss_sum /= test_inputs.len() as f64; + println!("Epoch {}, Loss: {:.3}", epoch + 1, loss_sum); + } + } + } +} + +#[cfg(test)] +mod test { + use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}}; + use super::*; + + #[test] + fn test_backpropagation_simple() { + for wa in [0.0, 0.25, 0.5, 1.0] { + for wb in [0.0, 0.25, 0.5, 1.0] { + let network = NeuraNetwork::new( + NeuraDenseLayer::new([[wa, wb]], [0.0], Linear), + () + ); + + let gradient = NeuraBackprop::new(Euclidean).get_gradient( + &network, + &[1.0, 1.0], + &[0.0] + ); + + let expected = wa + wb; + assert!((gradient.0[0][0] - expected) < 0.001); + assert!((gradient.0[0][1] - expected) < 0.001); + } } - loss_sum /= test_inputs.len() as f64; - println!("Epoch {epoch}, Loss: {:.3}", loss_sum); } } diff --git a/src/utils.rs b/src/utils.rs index 7b63642..4f8e535 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -54,36 +54,91 @@ pub(crate) fn assign_add_vector(sum: &mut [f64; N], operand: &[f } } +struct Chunked { + iter: J, + chunk_size: usize, +} + +impl Iterator for Chunked { + type Item = Vec; + + fn next(&mut self) -> Option { + let mut result = Vec::with_capacity(self.chunk_size); + + for _ in 0..self.chunk_size { + if let Some(item) = self.iter.next() { + result.push(item); + } else { + break; + } + } + + if result.len() > 0 { + Some(result) + } else { + None + } + } +} + pub(crate) fn chunked( iter: I, chunk_size: usize, ) -> impl Iterator> { - struct Chunked { - iter: J, - chunk_size: usize, - } + Chunked { iter, chunk_size } +} - impl Iterator for Chunked { - type Item = Vec; - fn next(&mut self) -> Option { - let mut result = Vec::with_capacity(self.chunk_size); +struct ShuffleCycled { + buffer: Vec, + index: usize, + iter: I, + rng: R, +} - for _ in 0..self.chunk_size { - if let Some(item) = self.iter.next() { - result.push(item); - } else { - break; - } - } +impl Iterator for ShuffleCycled where I::Item: Clone { + type Item = I::Item; - if result.len() > 0 { - Some(result) + #[inline] + fn next(&mut self) -> Option { + use rand::prelude::SliceRandom; + + if let Some(next) = self.iter.next() { + // Base iterator is not empty yet + self.buffer.push(next.clone()); + return Some(next) + } else if self.buffer.len() > 0 { + if self.index == 0 { + // Shuffle the vector and return the first element, setting the 
index to 1
+                self.buffer.shuffle(&mut self.rng);
+                self.index = 1;
+                Some(self.buffer[0].clone())
             } else {
-                None
+                // Keep consuming the shuffled vector
+                let res = self.buffer[self.index].clone();
+                self.index = (self.index + 1) % self.buffer.len();
+                Some(res)
             }
+        } else {
+            None
         }
     }
+}
 
-    Chunked { iter, chunk_size }
+pub fn cycle_shuffling<I: Iterator>(
+    iter: I,
+    rng: impl rand::Rng
+) -> impl Iterator<Item = I::Item>
+where
+    I::Item: Clone
+{
+    let size_hint = iter.size_hint();
+    let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
+
+    ShuffleCycled {
+        buffer: Vec::with_capacity(size_hint),
+        index: 0,
+        iter,
+        rng
+    }
 }
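
Illustration (not part of the patch): the per-batch weight update that `NeuraBatchedTrainer` documents on its `learning_rate` and `learning_momentum` fields is `W += -learning_rate * gradient_average - learning_momentum * previous_gradient`. The standalone sketch below applies that same rule to a single scalar weight minimising 0.5 * (w - target)^2, so the effect of the two constants can be checked in isolation; the quadratic objective and the numeric values are made up for the example and no neuramethyst types are used.

fn main() {
    let learning_rate = 0.1;
    let learning_momentum = 0.01;
    let target = 3.0;

    let mut w: f64 = 0.0;
    let mut previous_gradient = 0.0;

    for epoch in 0..100 {
        // Gradient of the toy loss 0.5 * (w - target)^2, standing in for the
        // batch-averaged gradient computed by the trainer.
        let gradient_average = w - target;

        // W += -learning_rate * gradient_average - learning_momentum * previous_gradient
        w += -learning_rate * gradient_average - learning_momentum * previous_gradient;
        previous_gradient = gradient_average;

        if (epoch + 1) % 25 == 0 {
            println!("Epoch {}, w = {:.3}", epoch + 1, w);
        }
    }
}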
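
Illustration (not part of the patch): the new `test_backpropagation_simple` expects the weight gradient of a 1x2 linear layer under the Euclidean loss to be `wa + wb` for input [1.0, 1.0] and target [0.0]. The plain-Rust sketch below re-derives that value by finite differences, assuming the loss is 0.5 * ||y - target||^2 (the scaling implied by the expected value in the test); again, no neuramethyst types are involved.

fn main() {
    // Toy layer: y = wa * x0 + wb * x1, linear activation, zero bias.
    let loss = |wa: f64, wb: f64| {
        let y = wa * 1.0 + wb * 1.0; // input [1.0, 1.0]
        0.5 * (y - 0.0_f64).powi(2) // target [0.0]
    };

    for (wa, wb) in [(0.25, 0.5), (1.0, 1.0)] {
        let h = 1e-6;
        // Finite-difference estimate of dL/dwa.
        let grad_wa = (loss(wa + h, wb) - loss(wa - h, wb)) / (2.0 * h);
        // Analytic value the test asserts: (y - target) * x0 = wa + wb.
        println!("dL/dwa ~ {:.4}, expected {:.4}", grad_wa, wa + wb);
    }
}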