✨ Re-order arguments of neura_layer, implement softmax and normalization

2 years ago · bca56a5557
parent 220c61ff6b
commit bca56a5557
13 changed files with 520 additions and 91 deletions
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@ -2,22 +2,26 @@

 use std::io::Write;

-use neuramethyst::prelude::*;
-use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu};
+use neuramethyst::derivable::activation::Linear;
+#[allow(unused_imports)]
+use neuramethyst::derivable::activation::{LeakyRelu, Relu, Tanh};
 use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::derivable::regularize::NeuraElastic;
+use neuramethyst::prelude::*;

 use rand::Rng;

 fn main() {
    let mut network = neura_network![
-        neura_layer!("dense", LeakyRelu(0.01), 9, 2),
+        neura_layer!("dense", 2, 8; LeakyRelu(0.01)),
        neura_layer!("dropout", 0.1),
-        neura_layer!("dense", LeakyRelu(0.01), 9),
+        neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
        neura_layer!("dropout", 0.3),
-        neura_layer!("dense", LeakyRelu(0.01), 6),
+        neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
        neura_layer!("dropout", 0.1),
-        neura_layer!("dense", LeakyRelu(0.01), 4),
-        neura_layer!("dense", LeakyRelu(0.1), 2)
+        neura_layer!("dense", 4; LeakyRelu(0.1), NeuraElastic::new(0.0001, 0.002)),
+        neura_layer!("dense", 2; Linear),
+        neura_layer!("softmax"),
    ];
    // println!("{:#?}", network);

@ -39,20 +43,23 @@ fn main() {

    let test_inputs: Vec<_> = inputs.clone().take(100).collect();

-    let mut trainer = NeuraBatchedTrainer::new(0.1, 4000);
-    trainer.log_epochs = 500;
+    let mut trainer = NeuraBatchedTrainer::new(0.25, 1000);
+    trainer.log_epochs = 50;
+    trainer.learning_momentum = 0.05;
+    trainer.batch_size = 2000;

    trainer.train(
        NeuraBackprop::new(Euclidean),
        &mut network,
        inputs,
-        &test_inputs
+        &test_inputs,
    );

    let mut file = std::fs::File::create("target/bivariate.csv").unwrap();
    for (input, _target) in test_inputs {
        let guess = argmax(&network.eval(&input));
        writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap();
+        // println!("{:?}", network.eval(&input));
    }

    // println!("{:#?}", network);
--- a/examples/xor.rs
+++ b/examples/xor.rs
@ -1,25 +1,30 @@
 #![feature(generic_arg_infer)]

-use neuramethyst::prelude::*;
-use neuramethyst::derivable::activation::{Relu};
+use neuramethyst::derivable::activation::Relu;
 use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::prelude::*;

 fn main() {
    let mut network = neura_network![
-        neura_layer!("dense", Relu, 4, 2),
-        neura_layer!("dense", Relu, 3),
-        neura_layer!("dense", Relu, 1)
+        neura_layer!("dense", 2, 4; Relu),
+        neura_layer!("dense", 3; Relu),
+        neura_layer!("dense", 1; Relu)
    ];

    let inputs = [
        ([0.0, 0.0], [0.0]),
        ([0.0, 1.0], [1.0]),
        ([1.0, 0.0], [1.0]),
-        ([1.0, 1.0], [0.0])
+        ([1.0, 1.0], [0.0]),
    ];

    for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
    }

    let mut trainer = NeuraBatchedTrainer::new(0.05, 1000);
@ -35,6 +40,11 @@ fn main() {
    );

    for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+        println!(
+            "Input: {:?}, target: {}, actual: {:.3}",
+            &input,
+            target[0],
+            network.eval(&input)[0]
+        );
    }
 }
--- a/src/derivable/activation.rs
+++ b/src/derivable/activation.rs
@ -36,10 +36,9 @@ impl NeuraDerivable<f32> for Relu {
 }

 #[derive(Clone, Copy, Debug, PartialEq)]
-pub struct LeakyRelu(pub f64);
+pub struct LeakyRelu<F>(pub F);

-
-impl NeuraDerivable<f64> for LeakyRelu {
+impl NeuraDerivable<f64> for LeakyRelu<f64> {
    #[inline(always)]
    fn eval(&self, input: f64) -> f64 {
        if input > 0.0 {
@ -59,13 +58,13 @@ impl NeuraDerivable<f64> for LeakyRelu {
    }
 }

-impl NeuraDerivable<f32> for LeakyRelu {
+impl NeuraDerivable<f32> for LeakyRelu<f32> {
    #[inline(always)]
    fn eval(&self, input: f32) -> f32 {
        if input > 0.0 {
            input
        } else {
-            (self.0 as f32) * input
+            self.0 * input
        }
    }

@ -74,7 +73,7 @@ impl NeuraDerivable<f32> for LeakyRelu {
        if input > 0.0 {
            1.0
        } else {
-            self.0 as f32
+            self.0
        }
    }
 }
--- a/src/derivable/mod.rs
+++ b/src/derivable/mod.rs
@ -1,5 +1,6 @@
 pub mod activation;
 pub mod loss;
+pub mod regularize;

 pub trait NeuraDerivable<F> {
    fn eval(&self, input: F) -> F;
--- a/src/derivable/regularize.rs
+++ b/src/derivable/regularize.rs
@ -0,0 +1,134 @@
+use super::*;
+
+/// Default regularization, which is no regularization
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL0;
+
+impl NeuraDerivable<f64> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f64) -> f64 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f64) -> f64 {
+        0.0
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL0 {
+    #[inline(always)]
+    fn eval(&self, _input: f32) -> f32 {
+        0.0
+    }
+
+    #[inline(always)]
+    fn derivate(&self, _at: f32) -> f32 {
+        0.0
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL1<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL1<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL1<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        self.0 * input.abs()
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        if at > 0.0 {
+            self.0
+        } else if at < 0.0 {
+            -self.0
+        } else {
+            0.0
+        }
+    }
+}
+
+/// L2 (ridge) regularization with strength `self.0`.
+///
+/// The penalty is `self.0 / 2 * x^2`, so that its derivative is `self.0 * x`.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraL2<F>(pub F);
+
+impl NeuraDerivable<f64> for NeuraL2<f64> {
+    /// Penalty contributed by a single value `input`.
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        // The 1/2 factor keeps `eval` consistent with `derivate`:
+        // d/dx (l2 / 2 * x^2) = l2 * x.
+        0.5 * self.0 * (input * input)
+    }
+
+    /// Derivative of the penalty, evaluated at `at`.
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        self.0 * at
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraL2<f32> {
+    /// Penalty contributed by a single value `input`.
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        // The 1/2 factor keeps `eval` consistent with `derivate`:
+        // d/dx (l2 / 2 * x^2) = l2 * x.
+        0.5 * self.0 * (input * input)
+    }
+
+    /// Derivative of the penalty, evaluated at `at`.
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        self.0 * at
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct NeuraElastic<F> {
+    pub l1: F,
+    pub l2: F,
+}
+
+impl<F> NeuraElastic<F> {
+    pub fn new(l1_factor: F, l2_factor: F) -> Self {
+        Self {
+            l1: l1_factor,
+            l2: l2_factor,
+        }
+    }
+}
+
+impl NeuraDerivable<f64> for NeuraElastic<f64> {
+    #[inline(always)]
+    fn eval(&self, input: f64) -> f64 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f64) -> f64 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
+
+impl NeuraDerivable<f32> for NeuraElastic<f32> {
+    #[inline(always)]
+    fn eval(&self, input: f32) -> f32 {
+        NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
+    }
+
+    #[inline(always)]
+    fn derivate(&self, at: f32) -> f32 {
+        NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
+    }
+}
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@ -1,39 +1,53 @@
 use super::NeuraLayer;
-use crate::{derivable::NeuraDerivable, utils::{multiply_matrix_vector, reverse_dot_product, multiply_matrix_transpose_vector}, train::NeuraTrainableLayer, algebra::NeuraVectorSpace};
+use crate::{
+    algebra::NeuraVectorSpace,
+    derivable::NeuraDerivable,
+    train::NeuraTrainableLayer,
+    utils::{multiply_matrix_transpose_vector, multiply_matrix_vector, reverse_dot_product},
+};

-use rand_distr::Distribution;
 use rand::Rng;
+use rand_distr::Distribution;

 #[derive(Clone, Debug)]
 pub struct NeuraDenseLayer<
    Act: NeuraDerivable<f64>,
+    Reg: NeuraDerivable<f64>,
    const INPUT_LEN: usize,
    const OUTPUT_LEN: usize,
 > {
    weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
    bias: [f64; OUTPUT_LEN],
    activation: Act,
+    regularization: Reg,
 }

-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
-    NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
    pub fn new(
        weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
        bias: [f64; OUTPUT_LEN],
        activation: Act,
+        regularization: Reg,
    ) -> Self {
        Self {
            weights,
            bias,
            activation,
+            regularization,
        }
    }

-    pub fn from_rng(rng: &mut impl Rng, activation: Act) -> Self {
+    pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
        let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN];

-        let distribution = rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
+        let distribution =
+            rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();

        for i in 0..OUTPUT_LEN {
            for j in 0..INPUT_LEN {
@ -46,12 +60,17 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
            // Biases are zero-initialized, as this shouldn't cause any issues during training
            bias: [0.0; OUTPUT_LEN],
            activation,
+            regularization,
        }
    }
 }

-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
    type Input = [f64; INPUT_LEN];

@ -68,13 +87,21 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
    }
 }

-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraTrainableLayer
-    for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+        Act: NeuraDerivable<f64>,
+        Reg: NeuraDerivable<f64>,
+        const INPUT_LEN: usize,
+        const OUTPUT_LEN: usize,
+    > NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
 {
    type Delta = ([[f64; INPUT_LEN]; OUTPUT_LEN], [f64; OUTPUT_LEN]);

    // TODO: double-check the math in this
-    fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) {
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
        let evaluated = multiply_matrix_vector(&self.weights, input);
        // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta`
        let mut delta = epsilon.clone();
@ -96,17 +123,32 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
        NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
        NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
    }
+
+    fn regularize(&self) -> Self::Delta {
+        let mut res = ([[0.0; INPUT_LEN]; OUTPUT_LEN], [0.0; OUTPUT_LEN]);
+
+        for i in 0..OUTPUT_LEN {
+            for j in 0..INPUT_LEN {
+                res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
+            }
+        }
+
+        // Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
+
+        res
+    }
 }

 #[cfg(test)]
 mod test {
    use super::*;
-    use crate::derivable::activation::Relu;
+    use crate::derivable::{activation::Relu, regularize::NeuraL0};

    #[test]
    fn test_from_rng() {
        let mut rng = rand::thread_rng();
-        let layer: NeuraDenseLayer<_, 64, 32> = NeuraDenseLayer::from_rng(&mut rng, Relu);
+        let layer: NeuraDenseLayer<_, _, 64, 32> =
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
        let mut input = [0.0; 64];
        for x in 0..64 {
            input[x] = rng.gen();
--- a/src/layer/dropout.rs
+++ b/src/layer/dropout.rs
@ -59,6 +59,10 @@ impl<const LENGTH: usize, R: Rng> NeuraTrainableLayer for NeuraDropoutLayer<LENG
        (epsilon, ())
    }

+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
    #[inline(always)]
    fn apply_gradient(&mut self, _gradient: &Self::Delta) {
        // Noop
--- a/src/layer/mod.rs
+++ b/src/layer/mod.rs
@ -4,6 +4,9 @@ pub use dense::NeuraDenseLayer;
 mod dropout;
 pub use dropout::NeuraDropoutLayer;

+mod softmax;
+pub use softmax::NeuraSoftmaxLayer;
+
 pub trait NeuraLayer {
    type Input;
    type Output;
@ -13,18 +16,34 @@ pub trait NeuraLayer {

 #[macro_export]
 macro_rules! neura_layer {
-    ( "dense", $activation:expr, $output:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, _, $output>
+    ( "dense", $( $shape:expr ),*; $activation:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
+        $crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
+            as neura_layer!("_dense_shape", $($shape),*)
+    };
+
+    ( "_dense_shape", $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, _, $output>
    };

-    ( "dense", $activation:expr, $output:expr, $input:expr ) => {
-        NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-            as NeuraDenseLayer<_, $input, $output>
+    ( "_dense_shape", $input:expr, $output:expr ) => {
+        $crate::layer::NeuraDenseLayer<_, _, $input, $output>
    };

    ( "dropout", $probability:expr ) => {
-        NeuraDropoutLayer::new($probability, rand::thread_rng())
-            as NeuraDropoutLayer<_, _>
+        $crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
+            as $crate::layer::NeuraDropoutLayer<_, _>
+    };
+
+    ( "softmax" ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
+    };
+
+    ( "softmax", $length:expr ) => {
+        $crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
    };
 }
--- a/src/layer/softmax.rs
+++ b/src/layer/softmax.rs
@ -0,0 +1,155 @@
+use crate::{train::NeuraTrainableLayer, utils::multiply_vectors_pointwise};
+
+use super::NeuraLayer;
+
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub struct NeuraSoftmaxLayer<const LENGTH: usize>;
+
+impl<const LENGTH: usize> NeuraSoftmaxLayer<LENGTH> {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl<const LENGTH: usize> NeuraLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Input = [f64; LENGTH];
+    type Output = [f64; LENGTH];
+
+    /// Computes the softmax of `input`: `exp(x_i) / sum_k exp(x_k)`.
+    ///
+    /// The largest component is subtracted from every component before
+    /// exponentiating; this leaves the result unchanged mathematically but
+    /// keeps `exp` from overflowing.
+    fn eval(&self, input: &Self::Input) -> Self::Output {
+        // Start from -inf rather than 0.0 so that the true maximum is found even
+        // when every component is negative; otherwise all exponentials can
+        // underflow to zero and the division below yields NaN.
+        let max = input.iter().copied().fold(f64::NEG_INFINITY, f64::max);
+
+        let mut res = *input;
+        for item in &mut res {
+            *item = (*item - max).exp();
+        }
+
+        let sum: f64 = res.iter().sum();
+        for item in &mut res {
+            *item /= sum;
+        }
+
+        res
+    }
+}
+
+impl<const LENGTH: usize> NeuraTrainableLayer for NeuraSoftmaxLayer<LENGTH> {
+    type Delta = ();
+
+    fn backpropagate(
+        &self,
+        input: &Self::Input,
+        mut epsilon: Self::Output,
+    ) -> (Self::Input, Self::Delta) {
+        // Note: softmax is unchanged when a constant is added to every component of `input`; this can be used to increase precision
+        let evaluated = self.eval(input);
+
+        // Compute $a_{l-1,i} \epsilon_{l,i}$, where $a_{l-1}$ is `evaluated`
+        epsilon = multiply_vectors_pointwise(&epsilon, &evaluated);
+
+        // Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
+        let sum_diagonal_terms: f64 = epsilon.iter().copied().sum();
+
+        for i in 0..LENGTH {
+            // Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and subtract it from $a_{l-1,i} \epsilon_{l,i}$
+            epsilon[i] -= evaluated[i] * sum_diagonal_terms;
+        }
+
+        (epsilon, ())
+    }
+
+    fn regularize(&self) -> Self::Delta {
+        ()
+    }
+
+    fn apply_gradient(&mut self, _gradient: &Self::Delta) {
+        // No-op: this layer's `Delta` is `()`, so there is nothing to update
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::algebra::NeuraVectorSpace;
+    use crate::utils::{
+        matrix_from_diagonal, multiply_matrix_vector, reverse_dot_product, uniform_vector,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_softmax_eval() {
+        const EPSILON: f64 = 0.000002;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<3>;
+
+        let result = layer.eval(&[1.0, 2.0, 8.0]);
+
+        assert!((result[0] - 0.0009088).abs() < EPSILON);
+        assert!((result[1] - 0.0024704).abs() < EPSILON);
+        assert!((result[2] - 0.9966208).abs() < EPSILON);
+    }
+
+    // Based on https://stats.stackexchange.com/a/306710
+    #[test]
+    fn test_softmax_backpropagation_two() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<2>;
+
+        for input1 in [0.2, 0.3, 0.5] as [f64; 3] {
+            for input2 in [0.7, 1.1, 1.3] {
+                let input = [input1, input2];
+                let sum = input1.exp() + input2.exp();
+                let output = [input1.exp() / sum, input2.exp() / sum];
+                for epsilon1 in [1.7, 1.9, 2.3] {
+                    for epsilon2 in [2.9, 3.1, 3.7] {
+                        let epsilon = [epsilon1, epsilon2];
+
+                        let (epsilon, _) = layer.backpropagate(&input, epsilon);
+                        let expected = [
+                            output[0] * (1.0 - output[0]) * epsilon1
+                                - output[1] * output[0] * epsilon2,
+                            output[1] * (1.0 - output[1]) * epsilon2
+                                - output[1] * output[0] * epsilon1,
+                        ];
+
+                        assert!((epsilon[0] - expected[0]).abs() < EPSILON);
+                        assert!((epsilon[1] - expected[1]).abs() < EPSILON);
+                    }
+                }
+            }
+        }
+    }
+
+    // Based on https://e2eml.school/softmax.html
+    #[test]
+    fn test_softmax_backpropagation() {
+        const EPSILON: f64 = 0.000001;
+        let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<4>;
+
+        for _ in 0..100 {
+            let input: [f64; 4] = uniform_vector();
+            let evaluated = layer.eval(&input);
+            let loss: [f64; 4] = uniform_vector();
+
+            let mut derivative = reverse_dot_product(&evaluated, &evaluated);
+            derivative.mul_assign(-1.0);
+            derivative.add_assign(&matrix_from_diagonal(&evaluated));
+
+            let expected = multiply_matrix_vector(&derivative, &loss);
+            let (actual, _) = layer.backpropagate(&input, loss);
+
+            for i in 0..4 {
+                assert!((expected[i] - actual[i]).abs() < EPSILON);
+            }
+        }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -10,15 +10,11 @@ mod utils;

 pub mod prelude {
    // Macros
-    pub use crate::{neura_network, neura_layer};
+    pub use crate::{neura_layer, neura_network};

    // Structs and traits
-    pub use crate::network::{NeuraNetwork};
-    pub use crate::layer::{
-        NeuraLayer,
-        NeuraDenseLayer,
-        NeuraDropoutLayer
-    };
+    pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer};
+    pub use crate::network::NeuraNetwork;
    pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
    pub use crate::utils::cycle_shuffling;
 }
--- a/src/network.rs
+++ b/src/network.rs
@ -82,6 +82,10 @@ impl<Layer: NeuraTrainableLayer> NeuraTrainable for NeuraNetwork<Layer, ()> {
        self.layer.backpropagate(&input, backprop_epsilon)
    }

+    fn regularize(&self) -> Self::Delta {
+        self.layer.regularize()
+    }
+
    fn prepare_epoch(&mut self) {
        self.layer.prepare_epoch();
    }
@ -117,6 +121,10 @@ impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainable<Input = Layer::Out
        (backprop_gradient, (layer_gradient, weights_gradient))
    }

+    fn regularize(&self) -> Self::Delta {
+        (self.layer.regularize(), self.child_network.regularize())
+    }
+
    fn prepare_epoch(&mut self) {
        self.layer.prepare_epoch();
        self.child_network.prepare_epoch();
@ -145,7 +153,11 @@ macro_rules! neura_network {

 #[cfg(test)]
 mod test {
-    use crate::{derivable::activation::Relu, layer::NeuraDenseLayer, neura_layer};
+    use crate::{
+        derivable::{activation::Relu, regularize::NeuraL0},
+        layer::NeuraDenseLayer,
+        neura_layer,
+    };

    use super::*;

@ -154,23 +166,24 @@ mod test {
        let mut rng = rand::thread_rng();

        let _ = neura_network![
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 2>
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 2>
        ];

-        let _ =
-            neura_network![NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,];
+        let _ = neura_network![
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+        ];

        let _ = neura_network![
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
-            NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+            NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
        ];

        let _ = neura_network![
-            neura_layer!("dense", Relu, 16, 8),
-            neura_layer!("dense", Relu, 12),
-            neura_layer!("dense", Relu, 2)
+            neura_layer!("dense", 8, 16; Relu),
+            neura_layer!("dense", 12; Relu),
+            neura_layer!("dense", 2; Relu)
        ];
    }
 }
--- a/src/train.rs
+++ b/src/train.rs
@ -1,8 +1,5 @@
 use crate::{
-    algebra::NeuraVectorSpace,
-    derivable::NeuraLoss,
-    layer::NeuraLayer,
-    network::NeuraNetwork,
+    algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer, network::NeuraNetwork,
 };

 // TODO: move this to layer/mod.rs
@ -26,6 +23,9 @@ pub trait NeuraTrainableLayer: NeuraLayer {
        epsilon: Self::Output,
    ) -> (Self::Input, Self::Delta);

+    /// Computes the regularization gradient for this layer's parameters
+    fn regularize(&self) -> Self::Delta;
+
    /// Applies `δW_l` to the weights of the layer
    fn apply_gradient(&mut self, gradient: &Self::Delta);

@ -51,6 +51,9 @@ pub trait NeuraTrainable: NeuraLayer {
        loss: Loss,
    ) -> (Self::Input, Self::Delta);

+    /// Should return the regularization gradient
+    fn regularize(&self) -> Self::Delta;
+
    /// Called before an epoch begins, to allow the network to set itself up for training.
    fn prepare_epoch(&mut self);

@ -89,8 +92,8 @@ impl<Loss: NeuraLoss + Clone> NeuraBackprop<Loss> {
    }
 }

-impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraGradientSolver<[f64; N], Loss::Target>
-    for NeuraBackprop<Loss>
+impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone>
+    NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop<Loss>
 {
    fn get_gradient<Layer: NeuraLayer, ChildNetwork>(
        &self,
@ -184,15 +187,17 @@ impl NeuraBatchedTrainer {
        NeuraNetwork<Layer, ChildNetwork>: NeuraTrainable<Input = Layer::Input, Output = Output>,
        Layer::Input: Clone,
    {
-        // TODO: apply shuffling?
        let mut iter = inputs.into_iter();
        let factor = -self.learning_rate / (self.batch_size as f64);
        let momentum_factor = self.learning_momentum / self.learning_rate;
+        let reg_factor = -self.learning_rate;

        // Contains `momentum_factor * factor * gradient_sum_previous_iter`
-        let mut previous_gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+        let mut previous_gradient_sum =
+            <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
        'd: for epoch in 0..self.epochs {
-            let mut gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+            let mut gradient_sum =
+                <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
            network.prepare_epoch();

            for _ in 0..self.batch_size {
@ -205,6 +210,12 @@ impl NeuraBatchedTrainer {
            }

            gradient_sum.mul_assign(factor);
+
+            // Add regularization gradient (TODO: check if it can be factored out of momentum)
+            let mut reg_gradient = network.regularize();
+            reg_gradient.mul_assign(reg_factor);
+            gradient_sum.add_assign(&reg_gradient);
+
            network.apply_gradient(&gradient_sum);

            if self.learning_momentum != 0.0 {
@ -230,23 +241,21 @@ impl NeuraBatchedTrainer {

 #[cfg(test)]
 mod test {
-    use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}};
    use super::*;
+    use crate::{
+        derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
+        layer::NeuraDenseLayer,
+    };

    #[test]
    fn test_backpropagation_simple() {
        for wa in [0.0, 0.25, 0.5, 1.0] {
            for wb in [0.0, 0.25, 0.5, 1.0] {
-                let network = NeuraNetwork::new(
-                    NeuraDenseLayer::new([[wa, wb]], [0.0], Linear),
-                    ()
-                );
-
-                let gradient = NeuraBackprop::new(Euclidean).get_gradient(
-                    &network,
-                    &[1.0, 1.0],
-                    &[0.0]
-                );
+                let network =
+                    NeuraNetwork::new(NeuraDenseLayer::new([[wa, wb]], [0.0], Linear, NeuraL0), ());
+
+                let gradient =
+                    NeuraBackprop::new(Euclidean).get_gradient(&network, &[1.0, 1.0], &[0.0]);

                let expected = wa + wb;
                assert!((gradient.0[0][0] - expected) < 0.001);
--- a/src/utils.rs
+++ b/src/utils.rs
@ -33,6 +33,7 @@ pub(crate) fn multiply_matrix_transpose_vector<const WIDTH: usize, const HEIGHT:
    result
 }

+// Returns the outer product $left \cdot right^{\top}$ (with column vectors), i.e. $\ket{left} \bra{right}$
 pub(crate) fn reverse_dot_product<const WIDTH: usize, const HEIGHT: usize>(
    left: &[f64; HEIGHT],
    right: &[f64; WIDTH],
@ -48,6 +49,32 @@ pub(crate) fn reverse_dot_product<const WIDTH: usize, const HEIGHT: usize>(
    result
 }

+pub(crate) fn multiply_vectors_pointwise<const LENGTH: usize>(
+    left: &[f64; LENGTH],
+    right: &[f64; LENGTH],
+) -> [f64; LENGTH] {
+    let mut result = [0.0; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i] = left[i] * right[i];
+    }
+
+    result
+}
+
+#[cfg(test)]
+pub(crate) fn matrix_from_diagonal<const LENGTH: usize>(
+    vector: &[f64; LENGTH],
+) -> [[f64; LENGTH]; LENGTH] {
+    let mut result = [[0.0; LENGTH]; LENGTH];
+
+    for i in 0..LENGTH {
+        result[i][i] = vector[i];
+    }
+
+    result
+}
+
 #[allow(dead_code)]
 pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f64; N]) {
    for i in 0..N {
@ -89,7 +116,10 @@ struct ShuffleCycled<I: Iterator, R: rand::Rng> {
    rng: R,
 }

-impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item: Clone {
+impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R>
+where
+    I::Item: Clone,
+{
    type Item = I::Item;

    #[inline]
@ -99,7 +129,7 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
        if let Some(next) = self.iter.next() {
            // Base iterator is not empty yet
            self.buffer.push(next.clone());
-            return Some(next)
+            return Some(next);
        } else if self.buffer.len() > 0 {
            if self.index == 0 {
                // Shuffle the vector and return the first element, setting the index to 1
@ -118,12 +148,9 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
    }
 }

-pub fn cycle_shuffling<I: Iterator>(
-    iter: I,
-    rng: impl rand::Rng
-) -> impl Iterator<Item=I::Item>
+pub fn cycle_shuffling<I: Iterator>(iter: I, rng: impl rand::Rng) -> impl Iterator<Item = I::Item>
 where
-    I::Item: Clone
+    I::Item: Clone,
 {
    let size_hint = iter.size_hint();
    let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
@ -132,6 +159,19 @@ where
        buffer: Vec::with_capacity(size_hint),
        index: 0,
        iter,
-        rng
+        rng,
    }
 }
+
+#[cfg(test)]
+pub(crate) fn uniform_vector<const LENGTH: usize>() -> [f64; LENGTH] {
+    use rand::Rng;
+    let mut res = [0.0; LENGTH];
+    let mut rng = rand::thread_rng();
+
+    for i in 0..LENGTH {
+        res[i] = rng.gen();
+    }
+
+    res
+}