diff --git a/examples/bivariate.rs b/examples/bivariate.rs
index cb89a56..10da1af 100644
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@@ -13,10 +13,12 @@ use rand::Rng;
 
 fn main() {
     let mut network = neura_sequential![
-        neura_layer!("dense", 8),
+        neura_layer!("dense", 8).regularization(NeuraL1(0.001)),
         neura_layer!("dropout", 0.25),
-        neura_layer!("dense", 2).activation(Linear),
-        // neura_layer!("softmax"),
+        neura_layer!("dense", 2)
+            .activation(Linear)
+            .regularization(NeuraL1(0.001)),
+        neura_layer!("softmax"),
     ]
     .construct(NeuraShape::Vector(2))
     .unwrap();
diff --git a/examples/convolution.rs b/examples/convolution.disabled-rs
similarity index 100%
rename from examples/convolution.rs
rename to examples/convolution.disabled-rs
diff --git a/src/derivable/loss.rs b/src/derivable/loss.rs
index da7c2f5..ec26552 100644
--- a/src/derivable/loss.rs
+++ b/src/derivable/loss.rs
@@ -1,8 +1,6 @@
 use nalgebra::DVector;
 use num::Float;
 
-use crate::algebra::NeuraVector;
-
 use super::NeuraLoss;
 
 #[derive(Clone, Copy, Debug, PartialEq)]
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
index 4e4e4e0..672ee00 100644
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@@ -2,8 +2,7 @@ use crate::algebra::NeuraVectorSpace;
 
 pub mod dense;
 pub mod dropout;
-
-pub use dense::NeuraDenseLayer;
+pub mod softmax;
 
 #[derive(Clone, Copy, PartialEq, Debug)]
 pub enum NeuraShape {
@@ -121,4 +120,8 @@ macro_rules! neura_layer {
     ( "dropout", $probability:expr ) => {
         $crate::layer::dropout::NeuraDropoutLayer::new($probability, rand::thread_rng())
     };
+
+    ( "softmax" ) => {
+        $crate::layer::softmax::NeuraSoftmaxLayer::new()
+    };
 }
diff --git a/src/layer/softmax.rs b/src/layer/softmax.rs
new file mode 100644
index 0000000..e428677
--- /dev/null
+++ b/src/layer/softmax.rs
@@ -0,0 +1,175 @@
+use nalgebra::{DVector, Scalar};
+use num::{traits::NumAssignOps, Float};
+
+use super::*;
+
+#[derive(Clone, Debug)]
+pub struct NeuraSoftmaxLayer {
+    shape: NeuraShape,
+}
+
+impl NeuraSoftmaxLayer {
+    pub fn new() -> Self {
+        Self {
+            shape: NeuraShape::Vector(0),
+        }
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraLayer<DVector<F>> for NeuraSoftmaxLayer {
+    type Output = DVector<F>;
+
+    fn eval(&self, input: &DVector<F>) -> Self::Output {
+        let mut res = input.clone();
+
+        let mut max = F::zero();
+        for &item in &res {
+            if item > max {
+                max = item;
+            }
+        }
+
+        let mut sum = F::zero();
+        for item in &mut res {
+            *item = (*item - max).exp();
+            sum += *item;
+        }
+
+        res /= sum;
+
+        res
+    }
+}
+
+impl NeuraPartialLayer for NeuraSoftmaxLayer {
+    type Constructed = Self;
+    type Err = ();
+
+    fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
+        Ok(Self { shape: input_shape })
+    }
+
+    fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
+        constructed.shape
+    }
+}
+
+impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraSoftmaxLayer {
+    type Gradient = ();
+
+    fn default_gradient(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn backprop_layer(
+        &self,
+        input: &DVector<F>,
+        mut epsilon: Self::Output,
+    ) -> (DVector<F>, Self::Gradient) {
+        // Note: a constant can be added to `input` to increase numerical precision (softmax is shift-invariant)
+        let evaluated = self.eval(input);
+
+        // Compute $a_{l-1,i} \epsilon_{l,i}$
+        hadamard_product(&mut epsilon, &evaluated);
+
+        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
+        let sum_diagonal_terms = epsilon.sum();
+
+        for i in 0..input.len() {
+            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and subtract it from $a_{l-1,i} \epsilon_{l,i}$
+            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
+        }
+
+        (epsilon, ())
+    }
+
+    fn regularize_layer(&self) -> Self::Gradient {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
+        // Noop
+    }
+}
+
+fn hadamard_product<F: Float + Scalar + NumAssignOps>(left: &mut DVector<F>, right: &DVector<F>) {
+    for i in 0..left.len() {
+        left[i] *= right[i];
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use nalgebra::{dvector, DMatrix};
+
+    use crate::utils::uniform_vector;
+
+    use super::*;
+
+    #[test]
+    fn test_softmax_eval() {
+        const EPSILON: f64 = 0.000002;
+        let layer = NeuraSoftmaxLayer::new();
+
+        let result = layer.eval(&dvector![1.0, 2.0, 8.0]);
+
+        assert!((result[0] - 0.0009088).abs() < EPSILON);
+        assert!((result[1] - 0.0024704).abs() < EPSILON);
+        assert!((result[2] - 0.9966208).abs() < EPSILON);
+    }
+
+    // Based on https://stats.stackexchange.com/a/306710
+    #[test]
+    fn test_softmax_backpropagation_two() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new();
+
+        for input1 in [0.2, 0.3, 0.5] as [f64; 3] {
+            for input2 in [0.7, 1.1, 1.3] {
+                let input = dvector![input1, input2];
+                let sum = input1.exp() + input2.exp();
+                let output = dvector![input1.exp() / sum, input2.exp() / sum];
+                for epsilon1 in [1.7, 1.9, 2.3] {
+                    for epsilon2 in [2.9, 3.1, 3.7] {
+                        let epsilon = dvector![epsilon1, epsilon2];
+
+                        let (epsilon, _) = layer.backprop_layer(&input, epsilon);
+                        let expected = [
+                            output[0] * (1.0 - output[0]) * epsilon1
+                                - output[1] * output[0] * epsilon2,
+                            output[1] * (1.0 - output[1]) * epsilon2
+                                - output[1] * output[0] * epsilon1,
+                        ];
+
+                        assert!((epsilon[0] - expected[0]).abs() < EPSILON);
+                        assert!((epsilon[1] - expected[1]).abs() < EPSILON);
+                    }
+                }
+            }
+        }
+    }
+
+    // Based on https://e2eml.school/softmax.html
+    #[test]
+    fn test_softmax_backpropagation() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new();
+
+        for _ in 0..100 {
+            let input = uniform_vector(4);
+            let evaluated = layer.eval(&input);
+            let loss = uniform_vector(4);
+
+            let mut derivative = &evaluated * evaluated.transpose();
+            derivative *= -1.0;
+            derivative += DMatrix::from_diagonal(&evaluated);
+
+            let expected = derivative * &loss;
+            let (actual, _) = layer.backprop_layer(&input, loss);
+
+            for i in 0..4 {
+                assert!((expected[i] - actual[i]).abs() < EPSILON);
+            }
+        }
+    }
+}
diff --git a/src/network/sequential/mod.rs b/src/network/sequential/mod.rs
index cbcc54f..ed95a87 100644
--- a/src/network/sequential/mod.rs
+++ b/src/network/sequential/mod.rs
@@ -247,7 +247,7 @@ mod test {
 
     use crate::{
         derivable::{activation::Relu, regularize::NeuraL0},
-        layer::{NeuraDenseLayer, NeuraLayer, NeuraShape},
+        layer::{dense::NeuraDenseLayer, NeuraLayer, NeuraShape},
         neura_layer,
     };
 
diff --git a/src/train.rs b/src/train.rs
index b9de86e..a331955 100644
--- a/src/train.rs
+++ b/src/train.rs
@@ -186,7 +186,7 @@ mod test {
     use crate::{
         assert_approx,
         derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
-        layer::{NeuraDenseLayer, NeuraLayer},
+        layer::{dense::NeuraDenseLayer, NeuraLayer},
         network::sequential::{NeuraSequential, NeuraSequentialTail},
         neura_sequential,
     };
diff --git a/src/utils.rs b/src/utils.rs
index 442c5bd..6a1d976 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -89,17 +89,12 @@ where
 }
 
 #[cfg(test)]
-pub(crate) fn uniform_vector<const LENGTH: usize>() -> NeuraVector<LENGTH, f64> {
+pub(crate) fn uniform_vector(length: usize) -> nalgebra::DVector<f64> {
+    use nalgebra::DVector;
     use rand::Rng;
 
-    let mut res: NeuraVector<LENGTH, f64> = NeuraVector::default();
     let mut rng = rand::thread_rng();
-
-    for i in 0..LENGTH {
-        res[i] = rng.gen();
-    }
-
-    res
+    DVector::from_fn(length, |_, _| -> f64 { rng.gen() })
 }
 
 pub fn one_hot(value: usize) -> NeuraVector {