From a5237a8ef15527891073c6424aa01aef28c4f13a Mon Sep 17 00:00:00 2001
From: Adrien Burgun
Date: Fri, 21 Apr 2023 16:08:36 +0200
Subject: [PATCH] :sparkles: Inefficient but working forward-forward implementation

---
 examples/bivariate-forward.rs          | 137 ++++++++++++++++++
 examples/bivariate.rs                  |  21 +--
 examples/xor.rs                        |   2 +-
 src/gradient_solver/backprop.rs        |   3 +-
 src/gradient_solver/forward_forward.rs | 191 +++++++++++++++++++++++++
 src/gradient_solver/mod.rs             |   3 +
 src/layer/mod.rs                       |   5 +
 src/layer/normalize.rs                 | 116 +++++++++++++++
 src/lib.rs                             |   4 +-
 src/network/mod.rs                     |   4 +-
 src/network/sequential/mod.rs          |   6 +-
 src/train.rs                           |   7 +-
 12 files changed, 479 insertions(+), 20 deletions(-)
 create mode 100644 examples/bivariate-forward.rs
 create mode 100644 src/gradient_solver/forward_forward.rs
 create mode 100644 src/layer/normalize.rs
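
Note: the per-layer signal that `NeuraForwardPair::eval_layer` (added below in
src/gradient_solver/forward_forward.rs) hands to `backprop_layer` is the gradient, with
respect to the layer's output, of `-activation(goodness - threshold)` for positive samples
and of `-activation(threshold - goodness)` for negative ones, where the goodness is the sum
of squared activations. The snippet below is a minimal, dependency-free sketch of that
computation for a `Tanh` activation; the function and variable names are illustrative only
and are not part of neuramethyst's API.

```rust
/// Derivative of tanh at `x`.
fn tanh_derivative(x: f64) -> f64 {
    let t = x.tanh();
    1.0 - t * t
}

/// Gradient, w.r.t. the layer output `y`, of `-tanh(±(goodness - threshold))`,
/// where `goodness = Σ yᵢ²` and the sign is `+` for positive samples and `-` for
/// negative ones. Positive samples push the goodness above the threshold,
/// negative samples push it below.
fn goodness_gradient(output: &[f64], threshold: f64, positive: bool) -> Vec<f64> {
    let goodness: f64 = output.iter().map(|y| y * y).sum();
    let signed = if positive {
        goodness - threshold
    } else {
        threshold - goodness
    };
    // The goodness has no 1/2 term, hence the explicit factor of 2.
    let scale = 2.0 * tanh_derivative(signed);
    output
        .iter()
        .map(|y| if positive { -scale * y } else { scale * y })
        .collect()
}

fn main() {
    let layer_output = [0.5, -0.25, 1.0];
    // This is the epsilon that would be handed to the layer's `backprop_layer`.
    println!("{:?}", goodness_gradient(&layer_output, 0.25, true));
}
```

The explicit factor of 2 and the sign flip for positive samples mirror the comments in
`eval_layer`: maximizing the goodness is rewritten as minimizing its negative, so the
batched trainer can treat both cases uniformly.
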
diff --git a/examples/bivariate-forward.rs b/examples/bivariate-forward.rs
new file mode 100644
index 0000000..9fde389
--- /dev/null
+++ b/examples/bivariate-forward.rs
@@ -0,0 +1,137 @@
+#![feature(generic_arg_infer)]
+
+use nalgebra::{dvector, DVector};
+#[allow(unused_imports)]
+use neuramethyst::derivable::activation::{LeakyRelu, Linear, Relu, Tanh};
+use neuramethyst::derivable::regularize::NeuraL1;
+use neuramethyst::gradient_solver::NeuraForwardForward;
+use neuramethyst::prelude::*;
+
+use rand::Rng;
+
+fn main() {
+    let mut network = neura_sequential![
+        neura_layer!("dense", 10).regularization(NeuraL1(0.001)),
+        neura_layer!("dropout", 0.25),
+        neura_layer!("normalize"),
+        neura_layer!("dense", 6).regularization(NeuraL1(0.001)),
+    ]
+    .construct(NeuraShape::Vector(3))
+    .unwrap();
+
+    let inputs = (0..1).cycle().map(move |_| {
+        let mut rng = rand::thread_rng();
+        let category = rng.gen_bool(0.5);
+        let good = rng.gen_bool(0.5);
+        let (x, y) = if category {
+            let radius: f32 = rng.gen_range(0.0..2.0);
+            let angle = rng.gen_range(0.0..std::f32::consts::TAU);
+            (angle.cos() * radius, angle.sin() * radius)
+        } else {
+            let radius: f32 = rng.gen_range(3.0..5.0);
+            let angle = rng.gen_range(0.0..std::f32::consts::TAU);
+            (angle.cos() * radius, angle.sin() * radius)
+        };
+
+        if good {
+            (dvector![x, y, category as u8 as f32], true)
+        } else {
+            (dvector![x, y, 1.0 - category as u8 as f32], false)
+        }
+    });
+
+    let test_inputs: Vec<_> = inputs.clone().filter(|(_, good)| *good).take(10).collect();
+    let threshold = 0.25f32;
+
+    if std::env::args().any(|arg| arg == "draw") {
+        for epoch in 0..200 {
+            let mut trainer = NeuraBatchedTrainer::new(0.03, 10);
+            trainer.batch_size = 10;
+
+            trainer.train(
+                &NeuraForwardForward::new(Tanh, threshold as f64),
+                &mut network,
+                inputs.clone(),
+                &test_inputs,
+            );
+
+            // let network = network.clone().trim_tail().trim_tail();
+            draw_neuron_activation(
+                |input| {
+                    let cat0 = network.eval(&dvector![input[0] as f32, input[1] as f32, 0.0]);
+                    let cat1 = network.eval(&dvector![input[0] as f32, input[1] as f32, 1.0]);
+
+                    let cat0_good = cat0.map(|x| x * x).sum();
+                    let cat1_good = cat1.map(|x| x * x).sum();
+                    let estimation = cat1_good / (cat0_good + cat1_good);
+
+                    let cat0_norm = cat0 / cat0_good.sqrt();
+                    let mut cat0_rgb = DVector::from_element(3, 0.0);
+
+                    for i in 0..cat0_norm.len() {
+                        cat0_rgb[i % 3] += cat0_norm[i].abs();
+                    }
+
+                    (cat0_rgb * estimation)
+                        .into_iter()
+                        .map(|x| *x as f64)
+                        .collect()
+                },
+                6.0,
+            );
+            println!("{}", epoch);
+
+            std::thread::sleep(std::time::Duration::new(0, 50_000_000));
+        }
+    } else {
+        let mut trainer = NeuraBatchedTrainer::new(0.03, 20 * 50);
+        trainer.batch_size = 10;
+        trainer.log_iterations = 20;
+
+        trainer.train(
+            &NeuraForwardForward::new(Tanh, threshold as f64),
+            &mut network,
+            inputs.clone(),
+            &test_inputs,
+        );
+
+        // println!("{}", String::from("\n").repeat(64));
+        // draw_neuron_activation(|input| network.eval(&input).into_iter().collect(), 6.0);
+    }
+}
+
+// TODO: move this to the library?
+fn draw_neuron_activation<F: Fn([f64; 2]) -> Vec<f64>>(callback: F, scale: f64) {
+    use viuer::Config;
+
+    const WIDTH: u32 = 64;
+    const HEIGHT: u32 = 64;
+
+    let mut image = image::RgbImage::new(WIDTH, HEIGHT);
+
+    fn sigmoid(x: f64) -> f64 {
+        0.1 + 0.9 * x.abs().powf(0.8)
+    }
+
+    for y in 0..HEIGHT {
+        let y2 = 2.0 * y as f64 / HEIGHT as f64 - 1.0;
+        for x in 0..WIDTH {
+            let x2 = 2.0 * x as f64 / WIDTH as f64 - 1.0;
+            let activation = callback([x2 * scale, y2 * scale]);
+            let r = (sigmoid(activation.get(0).copied().unwrap_or(-1.0)) * 255.0).floor() as u8;
+            let g = (sigmoid(activation.get(1).copied().unwrap_or(-1.0)) * 255.0).floor() as u8;
+            let b = (sigmoid(activation.get(2).copied().unwrap_or(-1.0)) * 255.0).floor() as u8;
+
+            *image.get_pixel_mut(x, y) = image::Rgb([r, g, b]);
+        }
+    }
+
+    let config = Config {
+        use_kitty: false,
+        truecolor: true,
+        // absolute_offset: false,
+        ..Default::default()
+    };
+
+    viuer::print(&image::DynamicImage::ImageRgb8(image), &config).unwrap();
+}
diff --git a/examples/bivariate.rs b/examples/bivariate.rs
index 10da1af..82fd87b 100644
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@@ -47,20 +47,24 @@ fn main() {
         trainer.batch_size = 10;
 
         trainer.train(
-            NeuraBackprop::new(CrossEntropy),
+            &NeuraBackprop::new(CrossEntropy),
             &mut network,
             inputs.clone(),
             &test_inputs,
         );
 
-        let network = network.clone();
+        let network_trimmed = network.clone().trim_tail().trim_tail();
         draw_neuron_activation(
             |input| {
-                network
-                    .eval(&dvector![input[0] as f32, input[1] as f32])
+                let output = network.eval(&dvector![input[0] as f32, input[1] as f32]);
+                let estimation = output[0] / (output[0] + output[1]);
+
+                let color = network_trimmed.eval(&dvector![input[0] as f32, input[1] as f32]);
+
+                (&color / color.map(|x| x * x).sum() * estimation)
                     .into_iter()
-                    .map(|x| *x as f64)
-                    .collect()
+                    .map(|x| x.abs() as f64)
+                    .collect::<Vec<_>>()
             },
             6.0,
         );
@@ -74,7 +78,7 @@ fn main() {
         trainer.log_iterations = 20;
 
         trainer.train(
-            NeuraBackprop::new(CrossEntropy),
+            &NeuraBackprop::new(CrossEntropy),
             &mut network,
             inputs.clone(),
             &test_inputs,
@@ -101,7 +105,7 @@ fn draw_neuron_activation<F: Fn([f64; 2]) -> Vec<f64>>(callback: F, scale: f64)
     let mut image = image::RgbImage::new(WIDTH, HEIGHT);
 
     fn sigmoid(x: f64) -> f64 {
-        1.0 / (1.0 + (-x * 3.0).exp())
+        1.9 / (1.0 + (-x * 3.0).exp()) - 0.9
     }
 
     for y in 0..HEIGHT {
@@ -119,6 +123,7 @@ fn draw_neuron_activation<F: Fn([f64; 2]) -> Vec<f64>>(callback: F, scale: f64)
 
     let config = Config {
         use_kitty: false,
+        truecolor: true,
         // absolute_offset: false,
         ..Default::default()
     };
diff --git a/examples/xor.rs b/examples/xor.rs
index 6657de1..8d4d7c3 100644
--- a/examples/xor.rs
+++ b/examples/xor.rs
@@ -38,7 +38,7 @@ fn main() {
     trainer.learning_momentum = 0.01;
 
     trainer.train(
-        NeuraBackprop::new(Euclidean),
+        &NeuraBackprop::new(Euclidean),
         &mut network,
         cycle_shuffling(inputs.iter().cloned(), rand::thread_rng()),
         &inputs,
diff --git a/src/gradient_solver/backprop.rs b/src/gradient_solver/backprop.rs
index f924f1f..d53ebf2 100644
--- a/src/gradient_solver/backprop.rs
+++ b/src/gradient_solver/backprop.rs
@@ -1,10 +1,9 @@
 use num::ToPrimitive;
 
-use crate::{network::NeuraTrainableNetworkBase, derivable::NeuraLoss, layer::NeuraTrainableLayer};
+use crate::{derivable::NeuraLoss, layer::NeuraTrainableLayer, network::NeuraTrainableNetworkBase};
 
 use super::*;
 
-
 pub struct NeuraBackprop<Loss> {
     loss: Loss,
 }
diff --git a/src/gradient_solver/forward_forward.rs b/src/gradient_solver/forward_forward.rs
new file mode 100644
index 0000000..76c8b70
--- /dev/null
+++ b/src/gradient_solver/forward_forward.rs
@@ -0,0 +1,191 @@
+use nalgebra::{DVector, Scalar};
+use num::{traits::NumAssignOps, Float, ToPrimitive};
+
+use crate::derivable::NeuraDerivable;
+
+use super::*;
+
+// TODO: add `max_depth: usize`
+pub struct NeuraForwardForward<Act> {
+    threshold: f64,
+    activation: Act,
+}
+
+impl<Act: NeuraDerivable<f64>> NeuraForwardForward<Act> {
+    pub fn new(activation: Act, threshold: f64) -> Self {
+        Self {
+            threshold,
+            activation,
+        }
+    }
+}
+
+struct NeuraForwardPair<Act> {
+    threshold: f64,
+    maximize: bool,
+    activation: Act,
+}
+
+impl<
+        F,
+        Act: Clone + NeuraDerivable<f64>,
+        Input,
+        Trainable: NeuraTrainableNetwork<Input, Output = DVector<F>>,
+    > NeuraGradientSolver<Input, bool, Trainable> for NeuraForwardForward<Act>
+where
+    F: ToPrimitive,
+{
+    fn get_gradient(
+        &self,
+        trainable: &Trainable,
+        input: &Input,
+        target: &bool,
+    ) -> Trainable::Gradient {
+        let target = *target;
+
+        trainable.traverse(
+            input,
+            &NeuraForwardPair {
+                threshold: self.threshold,
+                maximize: target,
+                activation: self.activation.clone(),
+            },
+        )
+    }
+
+    fn score(&self, trainable: &Trainable, input: &Input, target: &bool) -> f64 {
+        let output = trainable.eval(input);
+        let goodness = output
+            .iter()
+            .map(|x| x.to_f64().unwrap())
+            .reduce(|acc, x| acc + x * x)
+            .unwrap_or(0.0);
+        let goodness = goodness - self.threshold;
+        let goodness = self.activation.eval(goodness);
+
+        // Try to normalize the goodness so that if `Act([-threshold; +inf]) = [α; 1]`, `goodness ∈ [0; 1]`
+        let activation_zero = self.activation.eval(-self.threshold);
+        const EPSILON: f64 = 0.01;
+        let goodness = if activation_zero < 1.0 - EPSILON {
+            (goodness - activation_zero) / (1.0 - activation_zero)
+        } else {
+            goodness
+        };
+
+        if *target {
+            1.0 - goodness
+        } else {
+            goodness
+        }
+    }
+}
+
+impl<Act, LayerOutput, NetworkGradient> NeuraGradientSolverBase<LayerOutput, NetworkGradient>
+    for NeuraForwardPair<Act>
+{
+    type Output = NetworkGradient;
+}
+
+impl<Act, LayerOutput> NeuraGradientSolverFinal<LayerOutput> for NeuraForwardPair<Act> {
+    fn eval_final(&self, _output: LayerOutput) -> Self::Output {
+        ()
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps, Act: NeuraDerivable<F>>
+    NeuraGradientSolverTransient<DVector<F>> for NeuraForwardPair<Act>
+{
+    fn eval_layer<
+        Input,
+        NetworkGradient,
+        RecGradient,
+        Layer: NeuraTrainableLayer<Input, Output = DVector<F>>,
+    >(
+        &self,
+        layer: &Layer,
+        input: &Input,
+        rec_gradient: RecGradient,
+        combine_gradients: impl Fn(Layer::Gradient, RecGradient) -> NetworkGradient,
+    ) -> Self::Output {
+        let output = layer.eval(input);
+        let goodness = output
+            .iter()
+            .copied()
+            .reduce(|acc, x| acc + x * x)
+            .unwrap_or(F::zero());
+        let goodness = if self.maximize {
+            goodness - F::from(self.threshold).unwrap()
+        } else {
+            F::from(self.threshold).unwrap() - goodness
+        };
+        // We skip self.activation.eval(goodness)
+
+        let two = F::from(2.0).unwrap();
+
+        // The original formula does not have a 1/2 term,
+        // so we must multiply by 2
+        let mut goodness_derivative = output * (two * self.activation.derivate(goodness));
+
+        if self.maximize {
+            goodness_derivative = -goodness_derivative;
+        }
+
+        // TODO: split backprop_layer into eval_training, get_gradient and get_backprop
+        let (_, layer_gradient) = layer.backprop_layer(input, goodness_derivative);
+
+        combine_gradients(layer_gradient, rec_gradient)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use rand::Rng;
+
+    use super::*;
+    use crate::{
+        derivable::activation::{Relu, Tanh},
+        prelude::*,
+        utils::uniform_vector,
+    };
+
+    #[test]
+    fn test_forward_forward() {
+        let network = neura_sequential![
+            neura_layer!("dense", 10).activation(Relu),
+            neura_layer!("normalize"),
+            neura_layer!("dense", 4).activation(Relu),
+            neura_layer!("normalize"),
+            neura_layer!("dense", 1),
+        ]
+        .construct(NeuraShape::Vector(10))
+        .unwrap();
+
+        let fwd_solver = NeuraForwardForward::new(Tanh, 0.25);
+
+        fwd_solver.get_gradient(&network, &uniform_vector(10).map(|x| x as f32), &true);
+    }
+
+    #[test]
+    fn test_forward_forward_train() {
+        let mut network = neura_sequential![
+            neura_layer!("dense", 5).activation(Relu),
+            neura_layer!("normalize"),
+            neura_layer!("dense", 3).activation(Relu),
+            neura_layer!("normalize"),
+            neura_layer!("dense", 1)
+        ]
+        .construct(NeuraShape::Vector(4))
+        .unwrap();
+
+        let solver = NeuraForwardForward::new(Tanh, 0.25);
+
+        let trainer = NeuraBatchedTrainer::new(0.01, 20);
+
+        let inputs = (0..1).cycle().map(|_| {
+            let mut rng = rand::thread_rng();
+            (uniform_vector(4).map(|x| x as f32), rng.gen_bool(0.5))
+        });
+
+        let test_inputs: Vec<_> = inputs.clone().take(10).collect();
+
+        trainer.train(&solver, &mut network, inputs, test_inputs.as_slice());
+    }
+}
diff --git a/src/gradient_solver/mod.rs b/src/gradient_solver/mod.rs
index 83bc595..3f68076 100644
--- a/src/gradient_solver/mod.rs
+++ b/src/gradient_solver/mod.rs
@@ -1,6 +1,9 @@
 mod backprop;
 pub use backprop::NeuraBackprop;
 
+mod forward_forward;
+pub use forward_forward::NeuraForwardForward;
+
 use crate::{
     layer::NeuraTrainableLayer,
     network::{NeuraTrainableNetwork, NeuraTrainableNetworkBase},
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index 672ee00..10cc623 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -2,6 +2,7 @@ use crate::algebra::NeuraVectorSpace;
 
 pub mod dense;
 pub mod dropout;
+pub mod normalize;
 pub mod softmax;
 
 #[derive(Clone, Copy, PartialEq, Debug)]
@@ -124,4 +125,8 @@ macro_rules! neura_layer {
     ( "softmax" ) => {
         $crate::layer::softmax::NeuraSoftmaxLayer::new()
     };
+
+    ( "normalize" ) => {
+        $crate::layer::normalize::NeuraNormalizeLayer::new()
+    };
 }
diff --git a/src/layer/normalize.rs b/src/layer/normalize.rs
new file mode 100644
index 0000000..291657e
--- /dev/null
+++ b/src/layer/normalize.rs
@@ -0,0 +1,116 @@
+use nalgebra::{DVector, Scalar};
+use num::{traits::NumAssignOps, Float};
+
+use super::*;
+
+/// A layer that normalizes and centers its input, as follows:
+///
+/// ```no_rust
+/// μ = sum_i(x_i) / n
+/// σ² = sum_i((x_i - μ)^2) / n
+/// y_i = (x_i - μ) / σ
+/// ```
+#[derive(Debug, Clone, Copy)]
+pub struct NeuraNormalizeLayer {
+    shape: NeuraShape,
+}
+
+impl NeuraNormalizeLayer {
+    pub fn new() -> Self {
+        Self {
+            shape: NeuraShape::Vector(0),
+        }
+    }
+}
+
+impl NeuraPartialLayer for NeuraNormalizeLayer {
+    type Constructed = NeuraNormalizeLayer;
+
+    type Err = ();
+
+    fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
+        Ok(Self { shape: input_shape })
+    }
+
+    fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
+        constructed.shape
+    }
+}
+
+impl<F: Float + Scalar> NeuraLayer<DVector<F>> for NeuraNormalizeLayer {
+    type Output = DVector<F>;
+
+    fn eval(&self, input: &DVector<F>) -> Self::Output {
+        let (mean, variance, _) = mean_variance(input);
+        let stddev = F::sqrt(variance);
+
+        let mut output = input.clone();
+
+        for item in &mut output {
+            *item = (*item - mean) / stddev;
+        }
+
+        output
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraNormalizeLayer {
+    type Gradient = ();
+
+    fn backprop_layer(
+        &self,
+        input: &DVector<F>,
+        epsilon: Self::Output,
+    ) -> (DVector<F>, Self::Gradient) {
+        let (mean, variance, len) = mean_variance(input);
+        let stddev = F::sqrt(variance);
+        let input_centered = input.clone().map(|x| x - mean);
+
+        let mut jacobian_partial = &input_centered * input_centered.transpose();
+        jacobian_partial /= -variance * (stddev * len);
+        // Apply the 1/σ * dμ/dx_i term
+        for value in jacobian_partial.iter_mut() {
+            *value += F::one() / (stddev * len);
+        }
+
+        let mut epsilon_out = jacobian_partial * &epsilon;
+
+        // Apply the δ_{ik}/σ term
+        for i in 0..epsilon_out.len() {
+            epsilon_out[i] += epsilon[i] / stddev;
+        }
+
+        (epsilon_out, ())
+    }
+
+    fn default_gradient(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn regularize_layer(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+        // Noop
+    }
+}
+
+fn mean_variance<'a, F: Float + Scalar>(input: impl IntoIterator<Item = &'a F>) -> (F, F, F) {
+    // Quickly compute mean and variance in one pass
+    let mut count = 0;
+    let mut sum = F::zero();
+    let mut sum_squared = F::zero();
+
+    for &value in input.into_iter() {
+        count += 1;
+        sum = sum + value;
+        sum_squared = sum_squared + value * value;
+    }
+
+    let len = F::from(count).unwrap();
+    let mean = sum / len;
+    let variance = sum_squared / len - mean * mean;
+
+    (mean, variance, len)
+}
diff --git a/src/lib.rs b/src/lib.rs
index d8f1896..e5abe7d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,9 +3,9 @@
 pub mod algebra;
 pub mod derivable;
+pub mod gradient_solver;
 pub mod layer;
 pub mod network;
-pub mod gradient_solver;
 pub mod train;
 
 mod utils;
 
@@ -18,10 +18,10 @@ pub mod prelude {
     pub use crate::{neura_layer, neura_sequential};
 
     // Structs and traits
+    pub use crate::gradient_solver::NeuraBackprop;
     pub use crate::layer::*;
     pub use crate::network::sequential::{
         NeuraSequential, NeuraSequentialConstruct, NeuraSequentialTail,
     };
-    pub use crate::gradient_solver::NeuraBackprop;
     pub use crate::train::NeuraBatchedTrainer;
 }
diff --git a/src/network/mod.rs b/src/network/mod.rs
index 0e2556b..823ee61 100644
--- a/src/network/mod.rs
+++ b/src/network/mod.rs
@@ -1,4 +1,6 @@
-use crate::{algebra::NeuraVectorSpace, layer::NeuraLayer, gradient_solver::NeuraGradientSolverBase};
+use crate::{
+    algebra::NeuraVectorSpace, gradient_solver::NeuraGradientSolverBase, layer::NeuraLayer,
+};
 
 pub mod sequential;
 
diff --git a/src/network/sequential/mod.rs b/src/network/sequential/mod.rs
index 637a42f..a969d33 100644
--- a/src/network/sequential/mod.rs
+++ b/src/network/sequential/mod.rs
@@ -1,7 +1,7 @@
 use super::{NeuraTrainableNetwork, NeuraTrainableNetworkBase};
 use crate::{
-    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer},
     gradient_solver::{NeuraGradientSolverFinal, NeuraGradientSolverTransient},
+    layer::{NeuraLayer, NeuraPartialLayer, NeuraShape, NeuraTrainableLayer},
 };
 
 mod construct;
@@ -212,8 +212,8 @@ where
     }
 }
 
-impl> NeuraTrainableNetwork
-    for ()
+impl>
+    NeuraTrainableNetwork for ()
 {
     fn traverse(
         &self,
diff --git a/src/train.rs b/src/train.rs
index 1288578..b782b5c 100644
--- a/src/train.rs
+++ b/src/train.rs
@@ -1,5 +1,6 @@
 use crate::{
-    algebra::NeuraVectorSpace, network::NeuraTrainableNetworkBase, gradient_solver::NeuraGradientSolver,
+    algebra::NeuraVectorSpace, gradient_solver::NeuraGradientSolver,
+    network::NeuraTrainableNetworkBase,
 };
 
 #[non_exhaustive]
@@ -77,7 +78,7 @@ impl NeuraBatchedTrainer {
         Inputs: IntoIterator<Item = (Input, Target)>,
     >(
         &self,
-        gradient_solver: GradientSolver,
+        gradient_solver: &GradientSolver,
         network: &mut Network,
         inputs: Inputs,
         test_inputs: &[(Input, Target)],
@@ -140,10 +141,10 @@ mod test {
     use crate::{
         assert_approx,
         derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0, NeuraLoss},
+        gradient_solver::NeuraBackprop,
         layer::{dense::NeuraDenseLayer, NeuraLayer},
         network::sequential::{NeuraSequential, NeuraSequentialTail},
         neura_sequential,
-        gradient_solver::NeuraBackprop,
     };
 
     #[test]