From b4a97694a6398d965e083b906ebe00c42daefe58 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Sat, 15 Apr 2023 21:29:55 +0200 Subject: [PATCH] :art: Clean things up, add unit tests, add one hot layer --- Cargo.toml | 4 + examples/bivariate.rs | 119 +++++++++++++--------- examples/xor.rs | 2 +- src/algebra.rs | 18 ++++ src/derivable/activation.rs | 191 ++++++++++++++++-------------------- src/derivable/loss.rs | 53 ++++++++++ src/derivable/mod.rs | 12 +++ src/derivable/regularize.rs | 4 +- src/layer/dense.rs | 20 ++-- src/layer/mod.rs | 7 ++ src/layer/one_hot.rs | 61 ++++++++++++ src/lib.rs | 7 +- src/network.rs | 83 +++++++++++++--- src/train.rs | 76 ++++++++++---- src/utils.rs | 51 ++++++++++ 15 files changed, 511 insertions(+), 197 deletions(-) create mode 100644 src/layer/one_hot.rs diff --git a/Cargo.toml b/Cargo.toml index 9d8f4ba..d245660 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,7 @@ ndarray = "^0.15" # num-traits = "0.2.15" rand = "^0.8" rand_distr = "0.4.3" + +[dev-dependencies] +image = "0.24.6" +viuer = "0.6.2" diff --git a/examples/bivariate.rs b/examples/bivariate.rs index 5d64419..905ce37 100644 --- a/examples/bivariate.rs +++ b/examples/bivariate.rs @@ -2,85 +2,112 @@ use std::io::Write; -use neuramethyst::derivable::activation::Linear; #[allow(unused_imports)] -use neuramethyst::derivable::activation::{LeakyRelu, Relu, Tanh}; -use neuramethyst::derivable::loss::Euclidean; -use neuramethyst::derivable::regularize::NeuraElastic; +use neuramethyst::derivable::activation::{LeakyRelu, Linear, Relu, Tanh}; +use neuramethyst::derivable::loss::CrossEntropy; +use neuramethyst::derivable::regularize::NeuraL1; use neuramethyst::prelude::*; use rand::Rng; fn main() { let mut network = neura_network![ - neura_layer!("dense", 2, 8; LeakyRelu(0.01)), - neura_layer!("dropout", 0.1), - neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)), - neura_layer!("dropout", 0.3), - neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)), - neura_layer!("dropout", 0.1), - neura_layer!("dense", 4; LeakyRelu(0.1), NeuraElastic::new(0.0001, 0.002)), - neura_layer!("dense", 2; Linear), + neura_layer!("dense", 2, 8; Relu, NeuraL1(0.001)), + neura_layer!("dropout", 0.25), + neura_layer!("dense", 2; Linear, NeuraL1(0.001)), neura_layer!("softmax"), ]; - // println!("{:#?}", network); - let mut rng = rand::thread_rng(); - let inputs = (0..=1).cycle().map(move |category| { + let inputs = (0..1).cycle().map(move |_| { + let mut rng = rand::thread_rng(); // TODO: move out + let category = rng.gen_bool(0.5) as usize; let (x, y) = if category == 0 { - let radius: f64 = rng.gen_range(0.0..1.0); - let radius = radius.sqrt(); + let radius: f64 = rng.gen_range(0.0..2.0); let angle = rng.gen_range(0.0..std::f64::consts::TAU); (angle.cos() * radius, angle.sin() * radius) } else { - let radius: f64 = rng.gen_range(1.0..2.0); + let radius: f64 = rng.gen_range(3.0..5.0); let angle = rng.gen_range(0.0..std::f64::consts::TAU); (angle.cos() * radius, angle.sin() * radius) }; - ([x, y], one_hot::<2>(category)) + ([x, y], neuramethyst::one_hot::<2>(category)) }); - let test_inputs: Vec<_> = inputs.clone().take(100).collect(); + let test_inputs: Vec<_> = inputs.clone().take(10).collect(); - let mut trainer = NeuraBatchedTrainer::new(0.25, 1000); - trainer.log_epochs = 50; - trainer.learning_momentum = 0.05; - trainer.batch_size = 2000; + if std::env::args().any(|arg| arg == "draw") { + for epoch in 0..200 { + let mut trainer = NeuraBatchedTrainer::new(0.03, 10); + trainer.batch_size = 10; - trainer.train( - NeuraBackprop::new(Euclidean), - &mut network, - inputs, - &test_inputs, - ); + trainer.train( + NeuraBackprop::new(CrossEntropy), + &mut network, + inputs.clone(), + &test_inputs, + ); + + let network = network.clone(); + draw_neuron_activation(|input| network.eval(&input).into_iter().collect(), 6.0); + println!("{}", epoch); + + std::thread::sleep(std::time::Duration::new(0, 50_000_000)); + } + } else { + let mut trainer = NeuraBatchedTrainer::new(0.03, 20 * 50); + trainer.batch_size = 10; + trainer.log_iterations = 20; + + trainer.train( + NeuraBackprop::new(CrossEntropy), + &mut network, + inputs.clone(), + &test_inputs, + ); + + // println!("{}", String::from("\n").repeat(64)); + // draw_neuron_activation(|input| network.eval(&input).into_iter().collect(), 6.0); + } let mut file = std::fs::File::create("target/bivariate.csv").unwrap(); for (input, _target) in test_inputs { - let guess = argmax(&network.eval(&input)); + let guess = neuramethyst::argmax(&network.eval(&input)); writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap(); - // println!("{:?}", network.eval(&input)); } - - // println!("{:#?}", network); } -fn one_hot(value: usize) -> [f64; N] { - let mut res = [0.0; N]; - if value < N { - res[value] = 1.0; +// TODO: move this to the library? +fn draw_neuron_activation Vec>(callback: F, scale: f64) { + use viuer::Config; + + const WIDTH: u32 = 64; + const HEIGHT: u32 = 64; + + let mut image = image::RgbImage::new(WIDTH, HEIGHT); + + fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x * 3.0).exp()) } - res -} -fn argmax(array: &[f64]) -> usize { - let mut res = 0; + for y in 0..HEIGHT { + let y2 = 2.0 * y as f64 / HEIGHT as f64 - 1.0; + for x in 0..WIDTH { + let x2 = 2.0 * x as f64 / WIDTH as f64 - 1.0; + let activation = callback([x2 * scale, y2 * scale]); + let r = (sigmoid(activation.get(0).copied().unwrap_or(-1.0)) * 255.0).floor() as u8; + let g = (sigmoid(activation.get(1).copied().unwrap_or(-1.0)) * 255.0).floor() as u8; + let b = (sigmoid(activation.get(2).copied().unwrap_or(-1.0)) * 255.0).floor() as u8; - for n in 1..array.len() { - if array[n] > array[res] { - res = n; + *image.get_pixel_mut(x, y) = image::Rgb([r, g, b]); } } - res + let config = Config { + use_kitty: false, + // absolute_offset: false, + ..Default::default() + }; + + viuer::print(&image::DynamicImage::ImageRgb8(image), &config).unwrap(); } diff --git a/examples/xor.rs b/examples/xor.rs index be04629..a0a4572 100644 --- a/examples/xor.rs +++ b/examples/xor.rs @@ -29,7 +29,7 @@ fn main() { let mut trainer = NeuraBatchedTrainer::new(0.05, 1000); trainer.batch_size = 6; - trainer.log_epochs = 250; + trainer.log_iterations = 250; trainer.learning_momentum = 0.01; trainer.train( diff --git a/src/algebra.rs b/src/algebra.rs index d3bcd12..fffafe5 100644 --- a/src/algebra.rs +++ b/src/algebra.rs @@ -5,6 +5,8 @@ pub trait NeuraVectorSpace { fn mul_assign(&mut self, by: f64); fn zero() -> Self; + + fn norm_squared(&self) -> f64; } impl NeuraVectorSpace for () { @@ -22,6 +24,10 @@ impl NeuraVectorSpace for () { fn zero() -> Self { () } + + fn norm_squared(&self) -> f64 { + 0.0 + } } impl NeuraVectorSpace for (Left, Right) { @@ -38,6 +44,10 @@ impl NeuraVectorSpace for (Left fn zero() -> Self { (Left::zero(), Right::zero()) } + + fn norm_squared(&self) -> f64 { + self.0.norm_squared() + self.1.norm_squared() + } } impl NeuraVectorSpace for [T; N] { @@ -65,6 +75,10 @@ impl NeuraVectorSpace for [T; N] { unreachable!() }) } + + fn norm_squared(&self) -> f64 { + self.iter().map(T::norm_squared).sum() + } } macro_rules! base { @@ -81,6 +95,10 @@ macro_rules! base { fn zero() -> Self { ::default() } + + fn norm_squared(&self) -> f64 { + (self * self) as f64 + } } }; } diff --git a/src/derivable/activation.rs b/src/derivable/activation.rs index 07b97da..8ef3a5d 100644 --- a/src/derivable/activation.rs +++ b/src/derivable/activation.rs @@ -1,135 +1,108 @@ -use super::NeuraDerivable; - -#[derive(Clone, Copy, Debug, PartialEq)] -pub struct Relu; +#![allow(unused_variables)] -impl NeuraDerivable for Relu { - #[inline(always)] - fn eval(&self, input: f64) -> f64 { - input.max(0.0) - } +use super::NeuraDerivable; - #[inline(always)] - fn derivate(&self, input: f64) -> f64 { - if input > 0.0 { - 1.0 - } else { - 0.0 +macro_rules! impl_derivable { + ( $type_f32:ty, $type_f64:ty, $self:ident, $variable:ident, $eval:expr, $derivate:expr $(; $variance_hint:expr, $bias_hint:expr )? ) => { + impl NeuraDerivable for $type_f32 { + #[inline(always)] + fn eval($self: &Self, $variable: f32) -> f32 { + $eval + } + + #[inline(always)] + fn derivate($self: &Self, $variable: f32) -> f32 { + $derivate + } + + $( + #[inline(always)] + fn variance_hint($self: &Self) -> f64 { + $variance_hint + } + + #[inline(always)] + fn bias_hint($self: &Self) -> f64 { + $bias_hint + } + )? } - } -} -impl NeuraDerivable for Relu { - #[inline(always)] - fn eval(&self, input: f32) -> f32 { - input.max(0.0) - } - - #[inline(always)] - fn derivate(&self, input: f32) -> f32 { - if input > 0.0 { - 1.0 - } else { - 0.0 + impl NeuraDerivable for $type_f64 { + #[inline(always)] + fn eval($self: &Self, $variable: f64) -> f64 { + $eval + } + + #[inline(always)] + fn derivate($self: &Self, $variable: f64) -> f64 { + $derivate + } + + $( + #[inline(always)] + fn variance_hint($self: &Self) -> f64 { + $variance_hint + } + + #[inline(always)] + fn bias_hint($self: &Self) -> f64 { + $bias_hint + } + )? } - } + }; + + ( $type:ty, $variable:ident, $eval:expr, $derivate:expr $(; $variance_hint:expr, $bias_hint:expr )? ) => { + impl_derivable!($type, $type, self, $variable, $eval, $derivate $(; $variance_hint, $bias_hint)?); + }; } #[derive(Clone, Copy, Debug, PartialEq)] -pub struct LeakyRelu(pub F); +pub struct Relu; -impl NeuraDerivable for LeakyRelu { - #[inline(always)] - fn eval(&self, input: f64) -> f64 { - if input > 0.0 { - input - } else { - self.0 * input - } +impl_derivable!(Relu, x, x.max(0.0), { + if x > 0.0 { + 1.0 + } else { + 0.0 } +}; 2.0, 0.1); - #[inline(always)] - fn derivate(&self, input: f64) -> f64 { - if input > 0.0 { - 1.0 - } else { - self.0 - } - } -} +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct LeakyRelu(pub F); -impl NeuraDerivable for LeakyRelu { - #[inline(always)] - fn eval(&self, input: f32) -> f32 { - if input > 0.0 { - input +impl_derivable!( + LeakyRelu, + LeakyRelu, + self, + x, + { + if x > 0.0 { + x } else { - self.0 * input + self.0 * x } - } - - #[inline(always)] - fn derivate(&self, input: f32) -> f32 { - if input > 0.0 { + }, + { + if x > 0.0 { 1.0 } else { self.0 } - } -} + }; + 2.0, 0.1 +); #[derive(Clone, Copy, Debug, PartialEq)] pub struct Tanh; -impl NeuraDerivable for Tanh { - #[inline(always)] - fn eval(&self, input: f64) -> f64 { - 0.5 * input.tanh() + 0.5 - } - - #[inline(always)] - fn derivate(&self, at: f64) -> f64 { - let tanh = at.tanh(); - 0.5 * (1.0 - tanh * tanh) - } -} - -impl NeuraDerivable for Tanh { - #[inline(always)] - fn eval(&self, input: f32) -> f32 { - 0.5 * input.tanh() + 0.5 - } - - #[inline(always)] - fn derivate(&self, at: f32) -> f32 { - let tanh = at.tanh(); - 0.5 * (1.0 - tanh * tanh) - } -} +impl_derivable!(Tanh, x, x.tanh(), { + let y = x.tanh(); + 1.0 - y * y +}); #[derive(Clone, Copy, Debug, PartialEq)] pub struct Linear; -impl NeuraDerivable for Linear { - #[inline(always)] - fn eval(&self, input: f64) -> f64 { - input - } - - #[inline(always)] - fn derivate(&self, _at: f64) -> f64 { - 1.0 - } -} - -impl NeuraDerivable for Linear { - #[inline(always)] - fn eval(&self, input: f32) -> f32 { - input - } - - #[inline(always)] - fn derivate(&self, _at: f32) -> f32 { - 1.0 - } -} +impl_derivable!(Linear, x, x, 1.0); diff --git a/src/derivable/loss.rs b/src/derivable/loss.rs index a15e4d3..301ea08 100644 --- a/src/derivable/loss.rs +++ b/src/derivable/loss.rs @@ -30,3 +30,56 @@ impl NeuraLoss for Euclidean { res } } + +/// The cross-entropy loss function, defined as `L(y, ŷ) = -Σᵢ(yᵢ*ln(ŷᵢ))`. +/// +/// This version of the cross-entropy function does not make assumptions about the target vector being one-hot encoded. +/// +/// This function requires that `ŷ` (the output of the neural network) is in `[0; 1]^n`. +/// This guarantee is notably not given by the `Relu`, `LeakyRelu` and `Swish` activation functions, +/// so you should pick another activation on the last layer, or pass it into a `NeuraSoftmax` layer. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct CrossEntropy; + +const DERIVATIVE_CAP: f64 = 100.0; +const LOG_MIN: f64 = 0.00001; + +impl CrossEntropy { + #[inline(always)] + pub fn eval_single(&self, target: f64, actual: f64) -> f64 { + -target * actual.max(LOG_MIN).log(std::f64::consts::E) + } + + #[inline(always)] + pub fn derivate_single(&self, target: f64, actual: f64) -> f64 { + -(target / actual).min(DERIVATIVE_CAP) + } +} + +impl NeuraLoss for CrossEntropy { + type Input = [f64; N]; + type Target = [f64; N]; + + fn eval(&self, target: &Self::Target, actual: &Self::Input) -> f64 { + let mut result = 0.0; + + for i in 0..N { + result += self.eval_single(target[i], actual[i]); + } + + result + } + + fn nabla(&self, target: &Self::Target, actual: &Self::Input) -> Self::Input { + let mut result = [0.0; N]; + + for i in 0..N { + result[i] = self.derivate_single(target[i], actual[i]); + } + + result + } +} + +// TODO: a one-hot encoded, CrossEntropy + Softmax loss function? +// It would be a lot more efficient than the current method diff --git a/src/derivable/mod.rs b/src/derivable/mod.rs index b5c4412..57587e7 100644 --- a/src/derivable/mod.rs +++ b/src/derivable/mod.rs @@ -7,6 +7,18 @@ pub trait NeuraDerivable { /// Should return the derivative of `self.eval(input)` fn derivate(&self, at: F) -> F; + + /// Should return a hint for how much the variance for a random initialization should be + #[inline(always)] + fn variance_hint(&self) -> f64 { + 1.0 + } + + /// Should return a hint for what the default bias value should be + #[inline(always)] + fn bias_hint(&self) -> f64 { + 0.0 + } } pub trait NeuraLoss { diff --git a/src/derivable/regularize.rs b/src/derivable/regularize.rs index 91cdb30..48d9608 100644 --- a/src/derivable/regularize.rs +++ b/src/derivable/regularize.rs @@ -73,7 +73,7 @@ pub struct NeuraL2(pub F); impl NeuraDerivable for NeuraL2 { #[inline(always)] fn eval(&self, input: f64) -> f64 { - self.0 * (input * input) + (0.5 * self.0) * (input * input) } #[inline(always)] @@ -85,7 +85,7 @@ impl NeuraDerivable for NeuraL2 { impl NeuraDerivable for NeuraL2 { #[inline(always)] fn eval(&self, input: f32) -> f32 { - self.0 * (input * input) + (0.5 * self.0) * (input * input) } #[inline(always)] diff --git a/src/layer/dense.rs b/src/layer/dense.rs index 2c9e5fc..18e4f26 100644 --- a/src/layer/dense.rs +++ b/src/layer/dense.rs @@ -46,8 +46,14 @@ impl< pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self { let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN]; - let distribution = - rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap(); + // Use Xavier (or He) initialisation, using the harmonic mean + // Ref: https://www.deeplearning.ai/ai-notes/initialization/index.html + let distribution = rand_distr::Normal::new( + 0.0, + activation.variance_hint() * 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64), + ) + .unwrap(); + // let distribution = rand_distr::Uniform::new(-0.5, 0.5); for i in 0..OUTPUT_LEN { for j in 0..INPUT_LEN { @@ -57,8 +63,8 @@ impl< Self { weights, - // Biases are zero-initialized, as this shouldn't cause any issues during training - bias: [0.0; OUTPUT_LEN], + // Biases are initialized based on the activation's hint + bias: [activation.bias_hint(); OUTPUT_LEN], activation, regularization, } @@ -96,20 +102,22 @@ impl< { type Delta = ([[f64; INPUT_LEN]; OUTPUT_LEN], [f64; OUTPUT_LEN]); - // TODO: double-check the math in this fn backpropagate( &self, input: &Self::Input, epsilon: Self::Output, ) -> (Self::Input, Self::Delta) { let evaluated = multiply_matrix_vector(&self.weights, input); - // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta` + // Compute delta (the input gradient of the neuron) from epsilon (the output gradient of the neuron), + // with `self.activation'(input) ° epsilon = delta` let mut delta = epsilon.clone(); for i in 0..OUTPUT_LEN { delta[i] *= self.activation.derivate(evaluated[i]); } + // Compute the weight gradient let weights_gradient = reverse_dot_product(&delta, input); + // According to https://datascience.stackexchange.com/questions/20139/gradients-for-bias-terms-in-backpropagation // The gradient of the bias is equal to the delta term of the backpropagation algorithm let bias_gradient = delta; diff --git a/src/layer/mod.rs b/src/layer/mod.rs index 4ac7393..d4153ed 100644 --- a/src/layer/mod.rs +++ b/src/layer/mod.rs @@ -7,6 +7,9 @@ pub use dropout::NeuraDropoutLayer; mod softmax; pub use softmax::NeuraSoftmaxLayer; +mod one_hot; +pub use one_hot::NeuraOneHotLayer; + pub trait NeuraLayer { type Input; type Output; @@ -46,4 +49,8 @@ macro_rules! neura_layer { ( "softmax", $length:expr ) => { $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length> }; + + ( "one_hot" ) => { + $crate::layer::NeuraOneHotLayer as $crate::layer::NeuraOneHotLayer<2, _> + }; } diff --git a/src/layer/one_hot.rs b/src/layer/one_hot.rs new file mode 100644 index 0000000..39335b6 --- /dev/null +++ b/src/layer/one_hot.rs @@ -0,0 +1,61 @@ +use crate::train::NeuraTrainableLayer; + +use super::NeuraLayer; + +/// A special layer that allows you to split a vector into one-hot vectors +#[derive(Debug, Clone, PartialEq)] +pub struct NeuraOneHotLayer; + +impl NeuraLayer for NeuraOneHotLayer +where + [(); LENGTH * CATS]: Sized, +{ + type Input = [f64; LENGTH]; + type Output = [f64; LENGTH * CATS]; + + fn eval(&self, input: &Self::Input) -> Self::Output { + let mut res = [0.0; LENGTH * CATS]; + + for i in 0..LENGTH { + let cat_low = input[i].floor().max(0.0).min(CATS as f64 - 2.0); + let amount = (input[i] - cat_low).max(0.0).min(1.0); + let cat_low = cat_low as usize; + res[i * LENGTH + cat_low] = 1.0 - amount; + res[i * LENGTH + cat_low + 1] = amount; + } + + res + } +} + +impl NeuraTrainableLayer for NeuraOneHotLayer +where + [(); LENGTH * CATS]: Sized, +{ + type Delta = (); + + fn backpropagate( + &self, + input: &Self::Input, + epsilon: Self::Output, + ) -> (Self::Input, Self::Delta) { + let mut res = [0.0; LENGTH]; + + for i in 0..LENGTH { + let cat_low = input[i].floor().max(0.0).min(CATS as f64 - 2.0) as usize; + let epsilon = -epsilon[i * LENGTH + cat_low] + epsilon[i * LENGTH + cat_low + 1]; + // Scale epsilon by how many entries were ignored + res[i] = epsilon * CATS as f64 / 2.0; + } + + (res, ()) + } + + fn regularize(&self) -> Self::Delta { + () + } + + fn apply_gradient(&mut self, _gradient: &Self::Delta) { + // Noop + } +} diff --git a/src/lib.rs b/src/lib.rs index 4bfe1e5..012bb19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,6 @@ #![feature(generic_arg_infer)] +#![feature(generic_associated_types)] +#![feature(generic_const_exprs)] pub mod algebra; pub mod derivable; @@ -8,13 +10,16 @@ pub mod train; mod utils; +// TODO: move to a different file +pub use utils::{argmax, one_hot}; + pub mod prelude { // Macros pub use crate::{neura_layer, neura_network}; // Structs and traits pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer}; - pub use crate::network::NeuraNetwork; + pub use crate::network::{NeuraNetwork, NeuraNetworkTail}; pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer}; pub use crate::utils::cycle_shuffling; } diff --git a/src/network.rs b/src/network.rs index 3a6f4dc..de8b4f6 100644 --- a/src/network.rs +++ b/src/network.rs @@ -6,8 +6,8 @@ use crate::{ #[derive(Clone, Debug)] pub struct NeuraNetwork { - layer: Layer, - child_network: ChildNetwork, + pub layer: Layer, + pub child_network: ChildNetwork, } impl NeuraNetwork { @@ -25,20 +25,66 @@ impl NeuraNetwork { Self::new(layer, child_network) } - pub fn child_network(&self) -> &ChildNetwork { - &self.child_network + pub fn trim_front(self) -> ChildNetwork { + self.child_network } - pub fn layer(&self) -> &Layer { - &self.layer + pub fn push_front(self, layer: T) -> NeuraNetwork { + NeuraNetwork { + layer: layer, + child_network: self, + } } } -impl From for NeuraNetwork { - fn from(layer: Layer) -> Self { - Self { - layer, - child_network: (), +/// Operations on the tail end of the network +pub trait NeuraNetworkTail { + type TailTrimmed; + type TailPushed; + + fn trim_tail(self) -> Self::TailTrimmed; + fn push_tail(self, layer: T) -> Self::TailPushed; +} + +// Trimming the last layer returns an empty network +impl NeuraNetworkTail for NeuraNetwork { + type TailTrimmed = (); + type TailPushed = NeuraNetwork>; + + fn trim_tail(self) -> Self::TailTrimmed { + () + } + + fn push_tail(self, layer: T) -> Self::TailPushed { + NeuraNetwork { + layer: self.layer, + child_network: NeuraNetwork { + layer, + child_network: (), + }, + } + } +} + +// Trimming another layer returns a network which calls trim recursively +impl NeuraNetworkTail + for NeuraNetwork +{ + type TailTrimmed = NeuraNetwork::TailTrimmed>; + type TailPushed = + NeuraNetwork::TailPushed>; + + fn trim_tail(self) -> Self::TailTrimmed { + NeuraNetwork { + layer: self.layer, + child_network: self.child_network.trim_tail(), + } + } + + fn push_tail(self, layer: T) -> Self::TailPushed { + NeuraNetwork { + layer: self.layer, + child_network: self.child_network.push_tail(layer), } } } @@ -136,6 +182,15 @@ impl From for NeuraNetwork { + fn from(layer: Layer) -> Self { + Self { + layer, + child_network: (), + } + } +} + #[macro_export] macro_rules! neura_network { [] => { @@ -143,11 +198,11 @@ macro_rules! neura_network { }; [ $layer:expr $(,)? ] => { - NeuraNetwork::from($layer) + $crate::network::NeuraNetwork::from($layer) }; [ $first:expr, $($rest:expr),+ $(,)? ] => { - NeuraNetwork::new_match_output($first, neura_network![$($rest),+]) + $crate::network::NeuraNetwork::new_match_output($first, neura_network![$($rest),+]) }; } @@ -159,8 +214,6 @@ mod test { neura_layer, }; - use super::*; - #[test] fn test_neura_network_macro() { let mut rng = rand::thread_rng(); diff --git a/src/train.rs b/src/train.rs index c7627fe..1652452 100644 --- a/src/train.rs +++ b/src/train.rs @@ -2,7 +2,7 @@ use crate::{ algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer, network::NeuraNetwork, }; -// TODO: move this to layer/mod.rs +// TODO: move this trait to layer/mod.rs pub trait NeuraTrainableLayer: NeuraLayer { type Delta: NeuraVectorSpace; @@ -29,7 +29,7 @@ pub trait NeuraTrainableLayer: NeuraLayer { /// Applies `δW_l` to the weights of the layer fn apply_gradient(&mut self, gradient: &Self::Delta); - /// Called before an epoch begins, to allow the layer to set itself up for training. + /// Called before an iteration begins, to allow the layer to set itself up for training. #[inline(always)] fn prepare_epoch(&mut self) {} @@ -54,7 +54,7 @@ pub trait NeuraTrainable: NeuraLayer { /// Should return the regularization gradient fn regularize(&self) -> Self::Delta; - /// Called before an epoch begins, to allow the network to set itself up for training. + /// Called before an iteration begins, to allow the network to set itself up for training. fn prepare_epoch(&mut self); /// Called at the end of training, to allow the network to clean itself up @@ -116,7 +116,8 @@ impl + Clone> where NeuraNetwork: NeuraTrainable, { - self.loss.eval(target, &trainable.eval(&input)) + let output = trainable.eval(&input); + self.loss.eval(target, &output) } } @@ -138,15 +139,17 @@ pub struct NeuraBatchedTrainer { /// How many gradient computations to average before updating the weights pub batch_size: usize, - /// How many batches to run for; if `epochs * batch_size` exceeds the input length, then training will stop. + /// How many batches to run for; if `iterations * batch_size` exceeds the input length, then training will stop. /// You should use `cycle_shuffling` from the `prelude` module to avoid this. - pub epochs: usize, + /// + /// Note that this is different from epochs, which count how many times the dataset has been fully iterated over. + pub iterations: usize, - /// The trainer will log progress at every multiple of `log_epochs` steps. - /// If `log_epochs` is zero (default), then no progress will be logged. + /// The trainer will log progress at every multiple of `log_iterations` iterations. + /// If `log_iterations` is zero (default), then no progress will be logged. /// /// The test inputs is used to measure the score of the network. - pub log_epochs: usize, + pub log_iterations: usize, } impl Default for NeuraBatchedTrainer { @@ -155,17 +158,17 @@ impl Default for NeuraBatchedTrainer { learning_rate: 0.1, learning_momentum: 0.0, batch_size: 100, - epochs: 100, - log_epochs: 0, + iterations: 100, + log_iterations: 0, } } } impl NeuraBatchedTrainer { - pub fn new(learning_rate: f64, epochs: usize) -> Self { + pub fn new(learning_rate: f64, iterations: usize) -> Self { Self { learning_rate, - epochs, + iterations, ..Default::default() } } @@ -195,7 +198,7 @@ impl NeuraBatchedTrainer { // Contains `momentum_factor * factor * gradient_sum_previous_iter` let mut previous_gradient_sum = as NeuraTrainable>::Delta::zero(); - 'd: for epoch in 0..self.epochs { + 'd: for iteration in 0..self.iterations { let mut gradient_sum = as NeuraTrainable>::Delta::zero(); network.prepare_epoch(); @@ -211,7 +214,7 @@ impl NeuraBatchedTrainer { gradient_sum.mul_assign(factor); - // Add regularization gradient (TODO: check if it can be factored out of momentum) + // Add regularization gradient let mut reg_gradient = network.regularize(); reg_gradient.mul_assign(reg_factor); gradient_sum.add_assign(®_gradient); @@ -224,14 +227,14 @@ impl NeuraBatchedTrainer { previous_gradient_sum.mul_assign(momentum_factor); } - if self.log_epochs > 0 && (epoch + 1) % self.log_epochs == 0 { + if self.log_iterations > 0 && (iteration + 1) % self.log_iterations == 0 { network.cleanup(); let mut loss_sum = 0.0; for (input, target) in test_inputs { loss_sum += gradient_solver.score(&network, input, target); } loss_sum /= test_inputs.len() as f64; - println!("Epoch {}, Loss: {:.3}", epoch + 1, loss_sum); + println!("Iteration {}, Loss: {:.3}", iteration + 1, loss_sum); } } @@ -243,8 +246,11 @@ impl NeuraBatchedTrainer { mod test { use super::*; use crate::{ + assert_approx, derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0}, layer::NeuraDenseLayer, + network::NeuraNetworkTail, + neura_network, }; #[test] @@ -263,4 +269,40 @@ mod test { } } } + + #[test] + fn test_backpropagation_complex() { + const EPSILON: f64 = 0.00001; + // Test that we get the same values as https://hmkcode.com/ai/backpropagation-step-by-step/ + let network = neura_network![ + NeuraDenseLayer::new([[0.11, 0.21], [0.12, 0.08]], [0.0; 2], Linear, NeuraL0), + NeuraDenseLayer::new([[0.14, 0.15]], [0.0], Linear, NeuraL0) + ]; + + let input = [2.0, 3.0]; + let target = [1.0]; + + let intermediary = network.clone().trim_tail().eval(&input); + assert_approx!(0.85, intermediary[0], EPSILON); + assert_approx!(0.48, intermediary[1], EPSILON); + assert_approx!(0.191, network.eval(&input)[0], EPSILON); + + assert_approx!(0.327, Euclidean.eval(&target, &network.eval(&input)), 0.001); + + let delta = network.eval(&input)[0] - target[0]; + + let (gradient_first, gradient_second) = + NeuraBackprop::new(Euclidean).get_gradient(&network, &input, &target); + let gradient_first = gradient_first.0; + let gradient_second = gradient_second.0[0]; + + assert_approx!(gradient_second[0], intermediary[0] * delta, EPSILON); + assert_approx!(gradient_second[1], intermediary[1] * delta, EPSILON); + + assert_approx!(gradient_first[0][0], input[0] * delta * 0.14, EPSILON); + assert_approx!(gradient_first[0][1], input[1] * delta * 0.14, EPSILON); + + assert_approx!(gradient_first[1][0], input[0] * delta * 0.15, EPSILON); + assert_approx!(gradient_first[1][1], input[1] * delta * 0.15, EPSILON); + } } diff --git a/src/utils.rs b/src/utils.rs index 081312b..7367324 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -175,3 +175,54 @@ pub(crate) fn uniform_vector() -> [f64; LENGTH] { res } + +pub fn one_hot(value: usize) -> [f64; N] { + let mut res = [0.0; N]; + if value < N { + res[value] = 1.0; + } + res +} + +pub fn argmax(array: &[f64]) -> usize { + let mut res = 0; + + for n in 1..array.len() { + if array[n] > array[res] { + res = n; + } + } + + res +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_reverse_dot_product() { + let left = [2.0, 3.0, 5.0]; + let right = [7.0, 11.0, 13.0, 17.0]; + + let expected = [ + [14.0, 22.0, 26.0, 34.0], + [21.0, 33.0, 39.0, 51.0], + [35.0, 55.0, 65.0, 85.0], + ]; + + assert_eq!(expected, reverse_dot_product(&left, &right)); + } +} + +#[cfg(test)] +#[macro_export] +macro_rules! assert_approx { + ( $left:expr, $right:expr, $epsilon:expr ) => { + let left = $left; + let right = $right; + if (left - right).abs() >= $epsilon { + panic!("Expected {} to be approximately equal to {}", left, right); + } + }; +}