parent 0c97a65013
commit cb862f12cc
@@ -0,0 +1,175 @@
use nalgebra::{DVector, Scalar};
use num::{traits::NumAssignOps, Float};

use super::*;

#[derive(Clone, Debug)]
pub struct NeuraSoftmaxLayer {
    shape: NeuraShape,
}

impl NeuraSoftmaxLayer {
    pub fn new() -> Self {
        Self {
            shape: NeuraShape::Vector(0),
        }
    }
}
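
// Note: the `NeuraShape::Vector(0)` stored by `new()` is only a placeholder;
// `NeuraPartialLayer::construct` further down replaces it with the actual input shape,
// since softmax does not change the dimensionality of its input.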

impl<F: Float + Scalar + NumAssignOps> NeuraLayer<DVector<F>> for NeuraSoftmaxLayer {
    type Output = DVector<F>;

    fn eval(&self, input: &DVector<F>) -> Self::Output {
        let mut res = input.clone();

        let mut max = F::zero();
        for &item in &res {
            if item > max {
                max = item;
            }
        }

        let mut sum = F::zero();
        for item in &mut res {
            *item = (*item - max).exp();
            sum += *item;
        }

        res /= sum;

        res
    }
}
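
// The forward pass above computes the numerically stable softmax
// $a_i = \frac{e^{x_i - m}}{\sum_k e^{x_k - m}}$, where $m$ is the maximum found by the
// loop above (never below `F::zero()`); shifting by $m$ leaves the result unchanged but
// keeps the exponentials from overflowing.
//
// Minimal usage sketch, mirroring `test_softmax_eval` below (`dvector!` is nalgebra's macro):
//
//     let layer = NeuraSoftmaxLayer::new();
//     let result = layer.eval(&dvector![1.0f64, 2.0, 8.0]);
//     assert!((result.sum() - 1.0).abs() < 1e-9); // softmax outputs always sum to one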

impl NeuraPartialLayer for NeuraSoftmaxLayer {
    type Constructed = Self;
    type Err = ();

    fn construct(self, input_shape: NeuraShape) -> Result<Self::Constructed, Self::Err> {
        Ok(Self { shape: input_shape })
    }

    fn output_shape(constructed: &Self::Constructed) -> NeuraShape {
        constructed.shape
    }
}

impl<F: Float + Scalar + NumAssignOps> NeuraTrainableLayer<DVector<F>> for NeuraSoftmaxLayer {
    type Gradient = ();

    fn default_gradient(&self) -> Self::Gradient {
        ()
    }

    fn backprop_layer(
        &self,
        input: &DVector<F>,
        mut epsilon: Self::Output,
    ) -> (DVector<F>, Self::Gradient) {
        // Note: a constant value can be added to `input` to increase precision
        let evaluated = self.eval(input);

        // Compute $a_{l-1,i} \epsilon_{l,i}$
        hadamard_product(&mut epsilon, &evaluated);

        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
        let sum_diagonal_terms = epsilon.sum();

        for i in 0..input.len() {
            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and subtract it from $a_{l-1,i} \epsilon_{l,i}$
            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
        }

        (epsilon, ())
    }

    fn regularize_layer(&self) -> Self::Gradient {
        ()
    }

    fn apply_gradient(&mut self, _gradient: &Self::Gradient) {
        // Noop
    }
}
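
// Writing $a_i$ for `evaluated[i]`, the backward pass above applies the softmax Jacobian
// $\frac{\partial a_i}{\partial x_j} = a_i (\delta_{ij} - a_j)$ to the incoming epsilon:
// the returned value is $\epsilon'_i = a_i \epsilon_i - a_i \sum_k a_k \epsilon_k$.
// `test_softmax_backpropagation` below checks this against the explicit Jacobian matrix
// $\mathrm{diag}(a) - a a^\top$ built with `DMatrix::from_diagonal`.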

fn hadamard_product<F: Float + std::ops::MulAssign>(left: &mut DVector<F>, right: &DVector<F>) {
    for i in 0..left.len() {
        left[i] *= right[i];
    }
}

#[cfg(test)]
mod test {
    use nalgebra::{dvector, DMatrix};

    use crate::utils::uniform_vector;

    use super::*;

    #[test]
    fn test_softmax_eval() {
        const EPSILON: f64 = 0.000002;
        let layer = NeuraSoftmaxLayer::new();

        let result = layer.eval(&dvector![1.0, 2.0, 8.0]);

        assert!((result[0] - 0.0009088).abs() < EPSILON);
        assert!((result[1] - 0.0024704).abs() < EPSILON);
        assert!((result[2] - 0.9966208).abs() < EPSILON);
    }

    // Based on https://stats.stackexchange.com/a/306710
    #[test]
    fn test_softmax_backpropagation_two() {
        const EPSILON: f64 = 0.000001;
        let layer = NeuraSoftmaxLayer::new();

        for input1 in [0.2f64, 0.3, 0.5] {
            for input2 in [0.7, 1.1, 1.3] {
                let input = dvector![input1, input2];
                let sum = input1.exp() + input2.exp();
                let output = dvector![input1.exp() / sum, input2.exp() / sum];
                for epsilon1 in [1.7, 1.9, 2.3] {
                    for epsilon2 in [2.9, 3.1, 3.7] {
                        let epsilon = dvector![epsilon1, epsilon2];

                        let (epsilon, _) = layer.backprop_layer(&input, epsilon);
                        let expected = [
                            output[0] * (1.0 - output[0]) * epsilon1
                                - output[1] * output[0] * epsilon2,
                            output[1] * (1.0 - output[1]) * epsilon2
                                - output[1] * output[0] * epsilon1,
                        ];

                        assert!((epsilon[0] - expected[0]).abs() < EPSILON);
                        assert!((epsilon[1] - expected[1]).abs() < EPSILON);
                    }
                }
            }
        }
    }

    // Based on https://e2eml.school/softmax.html
    #[test]
    fn test_softmax_backpropagation() {
        const EPSILON: f64 = 0.000001;
        let layer = NeuraSoftmaxLayer::new();

        for _ in 0..100 {
            let input = uniform_vector(4);
            let evaluated = layer.eval(&input);
            let loss = uniform_vector(4);

            let mut derivative = &evaluated * evaluated.transpose();
            derivative *= -1.0;
            derivative += DMatrix::from_diagonal(&evaluated);

            let expected = derivative * &loss;
            let (actual, _) = layer.backprop_layer(&input, loss);

            for i in 0..4 {
                assert!((expected[i] - actual[i]).abs() < EPSILON);
            }
        }
    }
}