Re-order arguments of neura_layer, implement softmax and normalization

main
Shad Amethyst 2 years ago
parent 220c61ff6b
commit bca56a5557

@@ -2,22 +2,26 @@
use std::io::Write;
-use neuramethyst::prelude::*;
+use neuramethyst::derivable::activation::Linear;
-use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu};
+#[allow(unused_imports)]
+use neuramethyst::derivable::activation::{LeakyRelu, Relu, Tanh};
use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::derivable::regularize::NeuraElastic;
+use neuramethyst::prelude::*;
use rand::Rng;
fn main() {
let mut network = neura_network![
-neura_layer!("dense", LeakyRelu(0.01), 9, 2),
+neura_layer!("dense", 2, 8; LeakyRelu(0.01)),
neura_layer!("dropout", 0.1),
-neura_layer!("dense", LeakyRelu(0.01), 9),
+neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
neura_layer!("dropout", 0.3),
-neura_layer!("dense", LeakyRelu(0.01), 6),
+neura_layer!("dense", 8; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
neura_layer!("dropout", 0.1),
-neura_layer!("dense", LeakyRelu(0.01), 4),
+neura_layer!("dense", 4; LeakyRelu(0.1), NeuraElastic::new(0.0001, 0.002)),
-neura_layer!("dense", LeakyRelu(0.1), 2)
+neura_layer!("dense", 2; Linear),
+neura_layer!("softmax"),
];
// println!("{:#?}", network);
@@ -39,20 +43,23 @@ fn main() {
let test_inputs: Vec<_> = inputs.clone().take(100).collect();
-let mut trainer = NeuraBatchedTrainer::new(0.1, 4000);
+let mut trainer = NeuraBatchedTrainer::new(0.25, 1000);
-trainer.log_epochs = 500;
+trainer.log_epochs = 50;
+trainer.learning_momentum = 0.05;
+trainer.batch_size = 2000;
trainer.train(
NeuraBackprop::new(Euclidean),
&mut network,
inputs,
-&test_inputs
+&test_inputs,
);
let mut file = std::fs::File::create("target/bivariate.csv").unwrap();
for (input, _target) in test_inputs {
let guess = argmax(&network.eval(&input));
writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap();
+// println!("{:?}", network.eval(&input));
}
// println!("{:#?}", network);

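With the final neura_layer!("softmax") above, the bivariate example's network now outputs class probabilities, and argmax turns them into the class index written to the CSV. A minimal standalone sketch of that last step (the argmax below is a hypothetical stand-in for the crate's own helper):

fn argmax(values: &[f64]) -> usize {
    // Index of the largest probability; ties resolve to the first maximum.
    let mut best = 0;
    for (i, &v) in values.iter().enumerate() {
        if v > values[best] {
            best = i;
        }
    }
    best
}

fn main() {
    let probabilities = [0.12, 0.88]; // e.g. the two softmax outputs
    assert_eq!(argmax(&probabilities), 1);
}
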
@@ -1,25 +1,30 @@
#![feature(generic_arg_infer)]
-use neuramethyst::prelude::*;
+use neuramethyst::derivable::activation::Relu;
-use neuramethyst::derivable::activation::{Relu};
use neuramethyst::derivable::loss::Euclidean;
+use neuramethyst::prelude::*;
fn main() {
let mut network = neura_network![
-neura_layer!("dense", Relu, 4, 2),
+neura_layer!("dense", 2, 4; Relu),
-neura_layer!("dense", Relu, 3),
+neura_layer!("dense", 3; Relu),
-neura_layer!("dense", Relu, 1)
+neura_layer!("dense", 1; Relu)
];
let inputs = [
([0.0, 0.0], [0.0]),
([0.0, 1.0], [1.0]),
([1.0, 0.0], [1.0]),
-([1.0, 1.0], [0.0])
+([1.0, 1.0], [0.0]),
];
for (input, target) in inputs {
-println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+println!(
+"Input: {:?}, target: {}, actual: {:.3}",
+&input,
+target[0],
+network.eval(&input)[0]
+);
}
let mut trainer = NeuraBatchedTrainer::new(0.05, 1000);
@@ -35,6 +40,11 @@ fn main() {
);
for (input, target) in inputs {
-println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
+println!(
+"Input: {:?}, target: {}, actual: {:.3}",
+&input,
+target[0],
+network.eval(&input)[0]
+);
}
}

@@ -36,10 +36,9 @@ impl NeuraDerivable<f32> for Relu {
}
#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct LeakyRelu(pub f64);
+pub struct LeakyRelu<F>(pub F);
-impl NeuraDerivable<f64> for LeakyRelu {
+impl NeuraDerivable<f64> for LeakyRelu<f64> {
#[inline(always)]
fn eval(&self, input: f64) -> f64 {
if input > 0.0 {
@@ -59,13 +58,13 @@ impl NeuraDerivable<f64> for LeakyRelu {
}
}
-impl NeuraDerivable<f32> for LeakyRelu {
+impl NeuraDerivable<f32> for LeakyRelu<f32> {
#[inline(always)]
fn eval(&self, input: f32) -> f32 {
if input > 0.0 {
input
} else {
-(self.0 as f32) * input
+self.0 * input
}
}
@@ -74,7 +73,7 @@ impl NeuraDerivable<f32> for LeakyRelu {
if input > 0.0 {
1.0
} else {
-self.0 as f32
+self.0
}
}
}

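Making LeakyRelu generic over its slope type is what removes the `as f32` casts above: an f32 implementation can hold an f32 slope directly. A rough standalone illustration of the idea, using a simplified one-method trait rather than the crate's NeuraDerivable:

// Simplified sketch: the slope's type follows the struct's type parameter,
// so the f32 implementation needs no cast.
pub struct LeakyRelu<F>(pub F);

trait Eval<F> {
    fn eval(&self, input: F) -> F;
}

impl Eval<f64> for LeakyRelu<f64> {
    fn eval(&self, input: f64) -> f64 {
        if input > 0.0 { input } else { self.0 * input }
    }
}

impl Eval<f32> for LeakyRelu<f32> {
    fn eval(&self, input: f32) -> f32 {
        if input > 0.0 { input } else { self.0 * input }
    }
}

fn main() {
    assert_eq!(LeakyRelu(0.01_f64).eval(-2.0), -0.02);
    assert_eq!(LeakyRelu(0.1_f32).eval(-2.0_f32), -0.2_f32);
}
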
@@ -1,5 +1,6 @@
pub mod activation;
pub mod loss;
+pub mod regularize;
pub trait NeuraDerivable<F> {
fn eval(&self, input: F) -> F;

@@ -0,0 +1,134 @@
use super::*;
/// Default regularization, which is no regularization
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct NeuraL0;
impl NeuraDerivable<f64> for NeuraL0 {
#[inline(always)]
fn eval(&self, _input: f64) -> f64 {
0.0
}
#[inline(always)]
fn derivate(&self, _at: f64) -> f64 {
0.0
}
}
impl NeuraDerivable<f32> for NeuraL0 {
#[inline(always)]
fn eval(&self, _input: f32) -> f32 {
0.0
}
#[inline(always)]
fn derivate(&self, _at: f32) -> f32 {
0.0
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct NeuraL1<F>(pub F);
impl NeuraDerivable<f64> for NeuraL1<f64> {
#[inline(always)]
fn eval(&self, input: f64) -> f64 {
self.0 * input.abs()
}
#[inline(always)]
fn derivate(&self, at: f64) -> f64 {
if at > 0.0 {
self.0
} else if at < 0.0 {
-self.0
} else {
0.0
}
}
}
impl NeuraDerivable<f32> for NeuraL1<f32> {
#[inline(always)]
fn eval(&self, input: f32) -> f32 {
self.0 * input.abs()
}
#[inline(always)]
fn derivate(&self, at: f32) -> f32 {
if at > 0.0 {
self.0
} else if at < 0.0 {
-self.0
} else {
0.0
}
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct NeuraL2<F>(pub F);
impl NeuraDerivable<f64> for NeuraL2<f64> {
#[inline(always)]
fn eval(&self, input: f64) -> f64 {
self.0 * (input * input)
}
#[inline(always)]
fn derivate(&self, at: f64) -> f64 {
self.0 * at
}
}
impl NeuraDerivable<f32> for NeuraL2<f32> {
#[inline(always)]
fn eval(&self, input: f32) -> f32 {
self.0 * (input * input)
}
#[inline(always)]
fn derivate(&self, at: f32) -> f32 {
self.0 * at
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct NeuraElastic<F> {
pub l1: F,
pub l2: F,
}
impl<F> NeuraElastic<F> {
pub fn new(l1_factor: F, l2_factor: F) -> Self {
Self {
l1: l1_factor,
l2: l2_factor,
}
}
}
impl NeuraDerivable<f64> for NeuraElastic<f64> {
#[inline(always)]
fn eval(&self, input: f64) -> f64 {
NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
}
#[inline(always)]
fn derivate(&self, at: f64) -> f64 {
NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
}
}
impl NeuraDerivable<f32> for NeuraElastic<f32> {
#[inline(always)]
fn eval(&self, input: f32) -> f32 {
NeuraL1(self.l1).eval(input) + NeuraL2(self.l2).eval(input)
}
#[inline(always)]
fn derivate(&self, at: f32) -> f32 {
NeuraL1(self.l1).derivate(at) + NeuraL2(self.l2).derivate(at)
}
}

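The three regularizers follow the same contract as activations: eval returns the penalty for a single weight and derivate its gradient contribution, with NeuraElastic simply summing the L1 and L2 terms. A standalone sketch mirroring those formulas on one weight (the free functions below are illustrations, not the crate's API):

// Mirrors the definitions above for one weight w, returning (penalty, gradient):
//   L1:      (l1 * |w|,  l1 * sign(w))
//   L2:      (l2 * w^2,  l2 * w)
//   Elastic: the sum of both
fn l1(factor: f64, w: f64) -> (f64, f64) {
    let grad = if w > 0.0 { factor } else if w < 0.0 { -factor } else { 0.0 };
    (factor * w.abs(), grad)
}

fn l2(factor: f64, w: f64) -> (f64, f64) {
    (factor * w * w, factor * w)
}

fn elastic(l1_factor: f64, l2_factor: f64, w: f64) -> (f64, f64) {
    let (p1, g1) = l1(l1_factor, w);
    let (p2, g2) = l2(l2_factor, w);
    (p1 + p2, g1 + g2)
}

fn main() {
    // Same factors as the bivariate example: NeuraElastic::new(0.0001, 0.002)
    let (penalty, gradient) = elastic(0.0001, 0.002, -0.5);
    assert!((penalty - 0.00055).abs() < 1e-12);
    assert!((gradient + 0.0011).abs() < 1e-12);
}
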
@@ -1,39 +1,53 @@
use super::NeuraLayer;
-use crate::{derivable::NeuraDerivable, utils::{multiply_matrix_vector, reverse_dot_product, multiply_matrix_transpose_vector}, train::NeuraTrainableLayer, algebra::NeuraVectorSpace};
+use crate::{
+algebra::NeuraVectorSpace,
+derivable::NeuraDerivable,
+train::NeuraTrainableLayer,
+utils::{multiply_matrix_transpose_vector, multiply_matrix_vector, reverse_dot_product},
+};
-use rand_distr::Distribution;
use rand::Rng;
+use rand_distr::Distribution;
#[derive(Clone, Debug)]
pub struct NeuraDenseLayer<
Act: NeuraDerivable<f64>,
+Reg: NeuraDerivable<f64>,
const INPUT_LEN: usize,
const OUTPUT_LEN: usize,
> {
weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
bias: [f64; OUTPUT_LEN],
activation: Act,
+regularization: Reg,
}
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
-NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+Act: NeuraDerivable<f64>,
+Reg: NeuraDerivable<f64>,
+const INPUT_LEN: usize,
+const OUTPUT_LEN: usize,
+> NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
pub fn new(
weights: [[f64; INPUT_LEN]; OUTPUT_LEN],
bias: [f64; OUTPUT_LEN],
activation: Act,
+regularization: Reg,
) -> Self {
Self {
weights,
bias,
activation,
+regularization,
}
}
-pub fn from_rng(rng: &mut impl Rng, activation: Act) -> Self {
+pub fn from_rng(rng: &mut impl Rng, activation: Act, regularization: Reg) -> Self {
let mut weights = [[0.0; INPUT_LEN]; OUTPUT_LEN];
-let distribution = rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
+let distribution =
+rand_distr::Normal::new(0.0, 2.0 / (INPUT_LEN as f64 + OUTPUT_LEN as f64)).unwrap();
for i in 0..OUTPUT_LEN {
for j in 0..INPUT_LEN {
@@ -46,12 +60,17 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
// Biases are zero-initialized, as this shouldn't cause any issues during training
bias: [0.0; OUTPUT_LEN],
activation,
+regularization,
}
}
}
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraLayer
-for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+Act: NeuraDerivable<f64>,
+Reg: NeuraDerivable<f64>,
+const INPUT_LEN: usize,
+const OUTPUT_LEN: usize,
+> NeuraLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
type Input = [f64; INPUT_LEN];
@@ -68,13 +87,21 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
}
}
-impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize> NeuraTrainableLayer
-for NeuraDenseLayer<Act, INPUT_LEN, OUTPUT_LEN>
+impl<
+Act: NeuraDerivable<f64>,
+Reg: NeuraDerivable<f64>,
+const INPUT_LEN: usize,
+const OUTPUT_LEN: usize,
+> NeuraTrainableLayer for NeuraDenseLayer<Act, Reg, INPUT_LEN, OUTPUT_LEN>
{
type Delta = ([[f64; INPUT_LEN]; OUTPUT_LEN], [f64; OUTPUT_LEN]);
// TODO: double-check the math in this
-fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) {
+fn backpropagate(
+&self,
+input: &Self::Input,
+epsilon: Self::Output,
+) -> (Self::Input, Self::Delta) {
let evaluated = multiply_matrix_vector(&self.weights, input);
// Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta`
let mut delta = epsilon.clone();
@@ -96,17 +123,32 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
NeuraVectorSpace::add_assign(&mut self.weights, &gradient.0);
NeuraVectorSpace::add_assign(&mut self.bias, &gradient.1);
}
+fn regularize(&self) -> Self::Delta {
+let mut res = ([[0.0; INPUT_LEN]; OUTPUT_LEN], [0.0; OUTPUT_LEN]);
+for i in 0..OUTPUT_LEN {
+for j in 0..INPUT_LEN {
+res.0[i][j] = self.regularization.derivate(self.weights[i][j]);
+}
+}
+// Note: biases aren't taken into account here, as per https://stats.stackexchange.com/questions/153605/no-regularisation-term-for-bias-unit-in-neural-network
+res
+}
}
#[cfg(test)]
mod test {
use super::*;
-use crate::derivable::activation::Relu;
+use crate::derivable::{activation::Relu, regularize::NeuraL0};
#[test]
fn test_from_rng() {
let mut rng = rand::thread_rng();
-let layer: NeuraDenseLayer<_, 64, 32> = NeuraDenseLayer::from_rng(&mut rng, Relu);
+let layer: NeuraDenseLayer<_, _, 64, 32> =
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0);
let mut input = [0.0; 64];
for x in 0..64 {
input[x] = rng.gen();

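The new regularize() above returns a delta shaped like a gradient: one regularization derivative per weight and an all-zero bias part, since biases are deliberately left unregularized. A rough standalone sketch of that shape (reg_derivative stands in for self.regularization.derivate and is an assumption of this sketch):

// Builds a (weights, biases) delta in which only the weight part carries the
// regularization derivative; the bias part stays zero.
fn regularize<const IN: usize, const OUT: usize>(
    weights: &[[f64; IN]; OUT],
    reg_derivative: impl Fn(f64) -> f64,
) -> ([[f64; IN]; OUT], [f64; OUT]) {
    let mut delta = ([[0.0; IN]; OUT], [0.0; OUT]);
    for i in 0..OUT {
        for j in 0..IN {
            delta.0[i][j] = reg_derivative(weights[i][j]);
        }
    }
    delta
}

fn main() {
    let weights = [[0.5, -0.25], [1.0, 0.0]];
    let l2 = |w: f64| 0.002 * w; // same shape as NeuraL2(0.002).derivate
    let (dw, db) = regularize(&weights, l2);
    println!("weights: {:?}, biases: {:?}", dw, db);
}
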
@@ -59,6 +59,10 @@ impl<const LENGTH: usize, R: Rng> NeuraTrainableLayer for NeuraDropoutLayer<LENG
(epsilon, ())
}
+fn regularize(&self) -> Self::Delta {
+()
+}
#[inline(always)]
fn apply_gradient(&mut self, _gradient: &Self::Delta) {
// Noop

@@ -4,6 +4,9 @@ pub use dense::NeuraDenseLayer;
mod dropout;
pub use dropout::NeuraDropoutLayer;
+mod softmax;
+pub use softmax::NeuraSoftmaxLayer;
pub trait NeuraLayer {
type Input;
type Output;
@@ -13,18 +16,34 @@ pub trait NeuraLayer {
#[macro_export]
macro_rules! neura_layer {
-( "dense", $activation:expr, $output:expr ) => {
-NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-as NeuraDenseLayer<_, _, $output>
+( "dense", $( $shape:expr ),*; $activation:expr ) => {
+$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $crate::derivable::regularize::NeuraL0)
+as neura_layer!("_dense_shape", $($shape),*)
+};
+( "dense", $( $shape:expr ),*; $activation:expr, $regularization:expr ) => {
+$crate::layer::NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation, $regularization)
+as neura_layer!("_dense_shape", $($shape),*)
+};
+( "_dense_shape", $output:expr ) => {
+$crate::layer::NeuraDenseLayer<_, _, _, $output>
};
-( "dense", $activation:expr, $output:expr, $input:expr ) => {
-NeuraDenseLayer::from_rng(&mut rand::thread_rng(), $activation)
-as NeuraDenseLayer<_, $input, $output>
+( "_dense_shape", $input:expr, $output:expr ) => {
+$crate::layer::NeuraDenseLayer<_, _, $input, $output>
};
( "dropout", $probability:expr ) => {
-NeuraDropoutLayer::new($probability, rand::thread_rng())
-as NeuraDropoutLayer<_, _>
+$crate::layer::NeuraDropoutLayer::new($probability, rand::thread_rng())
+as $crate::layer::NeuraDropoutLayer<_, _>
+};
+( "softmax" ) => {
+$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<_>
+};
+( "softmax", $length:expr ) => {
+$crate::layer::NeuraSoftmaxLayer::new() as $crate::layer::NeuraSoftmaxLayer<$length>
};
}

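The macro hunk above is the core of the re-ordering: a dense layer is now written shape first, then a semicolon, then the activation and an optional regularizer, with the "_dense_shape" arms as internal helpers and "softmax" as a new layer kind. A hedged usage sketch, assuming the crate and the nightly feature used by the examples above:

#![feature(generic_arg_infer)]

use neuramethyst::derivable::activation::{LeakyRelu, Linear};
use neuramethyst::derivable::regularize::NeuraElastic;
use neuramethyst::prelude::*;

fn main() {
    let _network = neura_network![
        // input 2, output 8; no regularization (defaults to NeuraL0)
        neura_layer!("dense", 2, 8; LeakyRelu(0.01)),
        // output 4, input inferred; elastic-net regularization
        neura_layer!("dense", 4; LeakyRelu(0.01), NeuraElastic::new(0.0001, 0.002)),
        neura_layer!("dense", 2; Linear),
        // length inferred from the previous layer
        neura_layer!("softmax"),
    ];
}
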
@@ -0,0 +1,155 @@
use crate::{train::NeuraTrainableLayer, utils::multiply_vectors_pointwise};
use super::NeuraLayer;
#[non_exhaustive]
#[derive(Clone, Debug)]
pub struct NeuraSoftmaxLayer<const LENGTH: usize>;
impl<const LENGTH: usize> NeuraSoftmaxLayer<LENGTH> {
pub fn new() -> Self {
Self
}
}
impl<const LENGTH: usize> NeuraLayer for NeuraSoftmaxLayer<LENGTH> {
type Input = [f64; LENGTH];
type Output = [f64; LENGTH];
fn eval(&self, input: &Self::Input) -> Self::Output {
let mut res = input.clone();
let mut max = 0.0;
for item in &res {
if *item > max {
max = *item;
}
}
for item in &mut res {
*item = (*item - max).exp();
}
let mut sum = 0.0;
for item in &res {
sum += item;
}
for item in &mut res {
*item /= sum;
}
res
}
}
impl<const LENGTH: usize> NeuraTrainableLayer for NeuraSoftmaxLayer<LENGTH> {
type Delta = ();
fn backpropagate(
&self,
input: &Self::Input,
mut epsilon: Self::Output,
) -> (Self::Input, Self::Delta) {
// Note: a constant value can be added to `input` to increase precision
let evaluated = self.eval(input);
// Compute $a_{l-1,i} \epsilon_{l,i}$
epsilon = multiply_vectors_pointwise(&epsilon, &evaluated);
// Compute $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$
let sum_diagonal_terms: f64 = epsilon.iter().copied().sum();
for i in 0..LENGTH {
// Multiply $\sum_{k}{a_{l-1,k} \epsilon_{l,k}}$ by $a_{l-1,i}$ and add it to $a_{l-1,i} \epsilon_{l,i}$
epsilon[i] -= evaluated[i] * sum_diagonal_terms;
}
(epsilon, ())
}
fn regularize(&self) -> Self::Delta {
()
}
fn apply_gradient(&mut self, _gradient: &Self::Delta) {
// Noop
}
}
#[cfg(test)]
mod test {
use crate::algebra::NeuraVectorSpace;
use crate::utils::{
matrix_from_diagonal, multiply_matrix_vector, reverse_dot_product, uniform_vector,
};
use super::*;
#[test]
fn test_softmax_eval() {
const EPSILON: f64 = 0.000002;
let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<3>;
let result = layer.eval(&[1.0, 2.0, 8.0]);
assert!((result[0] - 0.0009088).abs() < EPSILON);
assert!((result[1] - 0.0024704).abs() < EPSILON);
assert!((result[2] - 0.9966208).abs() < EPSILON);
}
// Based on https://stats.stackexchange.com/a/306710
#[test]
fn test_softmax_backpropagation_two() {
const EPSILON: f64 = 0.000001;
let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<2>;
for input1 in [0.2, 0.3, 0.5] as [f64; 3] {
for input2 in [0.7, 1.1, 1.3] {
let input = [input1, input2];
let sum = input1.exp() + input2.exp();
let output = [input1.exp() / sum, input2.exp() / sum];
for epsilon1 in [1.7, 1.9, 2.3] {
for epsilon2 in [2.9, 3.1, 3.7] {
let epsilon = [epsilon1, epsilon2];
let (epsilon, _) = layer.backpropagate(&input, epsilon);
let expected = [
output[0] * (1.0 - output[0]) * epsilon1
- output[1] * output[0] * epsilon2,
output[1] * (1.0 - output[1]) * epsilon2
- output[1] * output[0] * epsilon1,
];
assert!((epsilon[0] - expected[0]).abs() < EPSILON);
assert!((epsilon[1] - expected[1]).abs() < EPSILON);
}
}
}
}
}
// Based on https://e2eml.school/softmax.html
#[test]
fn test_softmax_backpropagation() {
const EPSILON: f64 = 0.000001;
let layer = NeuraSoftmaxLayer::new() as NeuraSoftmaxLayer<4>;
for _ in 0..100 {
let input: [f64; 4] = uniform_vector();
let evaluated = layer.eval(&input);
let loss: [f64; 4] = uniform_vector();
let mut derivative = reverse_dot_product(&evaluated, &evaluated);
derivative.mul_assign(-1.0);
derivative.add_assign(&matrix_from_diagonal(&evaluated));
let expected = multiply_matrix_vector(&derivative, &loss);
let (actual, _) = layer.backpropagate(&input, loss);
for i in 0..4 {
assert!((expected[i] - actual[i]).abs() < EPSILON);
}
}
}
}

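The backpropagation above applies the softmax Jacobian J = diag(s) - s sᵀ to the incoming epsilon without building the matrix: each component becomes s_i * eps_i - s_i * sum_k(s_k * eps_k), which is what test_softmax_backpropagation checks against the explicit matrix. A standalone sketch of the same Jacobian-vector product (Vec-based, not the crate's fixed-size arrays):

// Softmax with max subtraction, then the matrix-free product (diag(s) - s sᵀ) · eps.
fn softmax(input: &[f64]) -> Vec<f64> {
    let max = input.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
    let exps: Vec<f64> = input.iter().map(|x| (x - max).exp()).collect();
    let sum: f64 = exps.iter().sum();
    exps.into_iter().map(|x| x / sum).collect()
}

fn softmax_backprop(input: &[f64], eps: &[f64]) -> Vec<f64> {
    let s = softmax(input);
    // dot = sum_k s_k * eps_k
    let dot: f64 = s.iter().zip(eps).map(|(si, ei)| si * ei).sum();
    // out_i = s_i * eps_i - s_i * dot
    s.iter().zip(eps).map(|(si, ei)| si * (ei - dot)).collect()
}

fn main() {
    println!("{:?}", softmax_backprop(&[1.0, 2.0, 8.0], &[0.5, -1.0, 0.25]));
}
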
@@ -10,15 +10,11 @@ mod utils;
pub mod prelude {
// Macros
-pub use crate::{neura_network, neura_layer};
+pub use crate::{neura_layer, neura_network};
// Structs and traits
-pub use crate::network::{NeuraNetwork};
-pub use crate::layer::{
-NeuraLayer,
-NeuraDenseLayer,
-NeuraDropoutLayer
-};
+pub use crate::layer::{NeuraDenseLayer, NeuraDropoutLayer, NeuraLayer};
+pub use crate::network::NeuraNetwork;
pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
pub use crate::utils::cycle_shuffling;
}

@@ -82,6 +82,10 @@ impl<Layer: NeuraTrainableLayer> NeuraTrainable for NeuraNetwork<Layer, ()> {
self.layer.backpropagate(&input, backprop_epsilon)
}
+fn regularize(&self) -> Self::Delta {
+self.layer.regularize()
+}
fn prepare_epoch(&mut self) {
self.layer.prepare_epoch();
}
@@ -117,6 +121,10 @@ impl<Layer: NeuraTrainableLayer, ChildNetwork: NeuraTrainable<Input = Layer::Out
(backprop_gradient, (layer_gradient, weights_gradient))
}
+fn regularize(&self) -> Self::Delta {
+(self.layer.regularize(), self.child_network.regularize())
+}
fn prepare_epoch(&mut self) {
self.layer.prepare_epoch();
self.child_network.prepare_epoch();
@@ -145,7 +153,11 @@ macro_rules! neura_network {
#[cfg(test)]
mod test {
-use crate::{derivable::activation::Relu, layer::NeuraDenseLayer, neura_layer};
+use crate::{
+derivable::{activation::Relu, regularize::NeuraL0},
+layer::NeuraDenseLayer,
+neura_layer,
+};
use super::*;
@@ -154,23 +166,24 @@ mod test {
let mut rng = rand::thread_rng();
let _ = neura_network![
-NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
-NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
-NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 2>
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 2>
];
-let _ =
-neura_network![NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,];
+let _ = neura_network![
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
+];
let _ = neura_network![
-NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, 8, 16>,
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, 8, 16>,
-NeuraDenseLayer::from_rng(&mut rng, Relu) as NeuraDenseLayer<_, _, 12>,
+NeuraDenseLayer::from_rng(&mut rng, Relu, NeuraL0) as NeuraDenseLayer<_, _, _, 12>,
];
let _ = neura_network![
-neura_layer!("dense", Relu, 16, 8),
+neura_layer!("dense", 8, 16; Relu),
-neura_layer!("dense", Relu, 12),
+neura_layer!("dense", 12; Relu),
-neura_layer!("dense", Relu, 2)
+neura_layer!("dense", 2; Relu)
];
}
}

@@ -1,8 +1,5 @@
use crate::{
-algebra::NeuraVectorSpace,
-derivable::NeuraLoss,
-layer::NeuraLayer,
-network::NeuraNetwork,
+algebra::NeuraVectorSpace, derivable::NeuraLoss, layer::NeuraLayer, network::NeuraNetwork,
};
// TODO: move this to layer/mod.rs
@@ -26,6 +23,9 @@ pub trait NeuraTrainableLayer: NeuraLayer {
epsilon: Self::Output,
) -> (Self::Input, Self::Delta);
+/// Computes the regularization
+fn regularize(&self) -> Self::Delta;
/// Applies `δW_l` to the weights of the layer
fn apply_gradient(&mut self, gradient: &Self::Delta);
@@ -51,6 +51,9 @@ pub trait NeuraTrainable: NeuraLayer {
loss: Loss,
) -> (Self::Input, Self::Delta);
+/// Should return the regularization gradient
+fn regularize(&self) -> Self::Delta;
/// Called before an epoch begins, to allow the network to set itself up for training.
fn prepare_epoch(&mut self);
@@ -89,8 +92,8 @@ impl<Loss: NeuraLoss + Clone> NeuraBackprop<Loss> {
}
}
-impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraGradientSolver<[f64; N], Loss::Target>
-for NeuraBackprop<Loss>
+impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone>
+NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop<Loss>
{
fn get_gradient<Layer: NeuraLayer, ChildNetwork>(
&self,
@@ -184,15 +187,17 @@ impl NeuraBatchedTrainer {
NeuraNetwork<Layer, ChildNetwork>: NeuraTrainable<Input = Layer::Input, Output = Output>,
Layer::Input: Clone,
{
-// TODO: apply shuffling?
let mut iter = inputs.into_iter();
let factor = -self.learning_rate / (self.batch_size as f64);
let momentum_factor = self.learning_momentum / self.learning_rate;
+let reg_factor = -self.learning_rate;
// Contains `momentum_factor * factor * gradient_sum_previous_iter`
-let mut previous_gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+let mut previous_gradient_sum =
+<NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
'd: for epoch in 0..self.epochs {
-let mut gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
+let mut gradient_sum =
+<NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
network.prepare_epoch();
for _ in 0..self.batch_size {
@@ -205,6 +210,12 @@ impl NeuraBatchedTrainer {
}
gradient_sum.mul_assign(factor);
+// Add regularization gradient (TODO: check if it can be factored out of momentum)
+let mut reg_gradient = network.regularize();
+reg_gradient.mul_assign(reg_factor);
+gradient_sum.add_assign(&reg_gradient);
network.apply_gradient(&gradient_sum);
if self.learning_momentum != 0.0 {
@@ -230,23 +241,21 @@ impl NeuraBatchedTrainer {
#[cfg(test)]
mod test {
-use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}};
use super::*;
+use crate::{
+derivable::{activation::Linear, loss::Euclidean, regularize::NeuraL0},
+layer::NeuraDenseLayer,
+};
#[test]
fn test_backpropagation_simple() {
for wa in [0.0, 0.25, 0.5, 1.0] {
for wb in [0.0, 0.25, 0.5, 1.0] {
-let network = NeuraNetwork::new(
-NeuraDenseLayer::new([[wa, wb]], [0.0], Linear),
-()
-);
+let network =
+NeuraNetwork::new(NeuraDenseLayer::new([[wa, wb]], [0.0], Linear, NeuraL0), ());
-let gradient = NeuraBackprop::new(Euclidean).get_gradient(
-&network,
-&[1.0, 1.0],
-&[0.0]
-);
+let gradient =
+NeuraBackprop::new(Euclidean).get_gradient(&network, &[1.0, 1.0], &[0.0]);
let expected = wa + wb;
assert!((gradient.0[0][0] - expected) < 0.001);

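With the regularize() hook in place, each batch update above combines two scaled terms before apply_gradient: the summed loss gradient times -learning_rate / batch_size, and the regularization gradient times -learning_rate (momentum handled separately). For a single scalar weight this reduces to the sketch below, where reg_derivative is a stand-in for network.regularize():

// Scalar sketch of one batched update (momentum omitted):
//   w += factor * sum(grads) + reg_factor * reg'(w)
// with factor = -lr / batch_size and reg_factor = -lr, as in the trainer above.
fn batched_update(
    w: f64,
    batch_gradients: &[f64],
    reg_derivative: impl Fn(f64) -> f64,
    learning_rate: f64,
) -> f64 {
    let factor = -learning_rate / (batch_gradients.len() as f64);
    let reg_factor = -learning_rate;
    let scaled_gradient: f64 = batch_gradients.iter().sum::<f64>() * factor;
    let scaled_reg = reg_derivative(w) * reg_factor;
    w + scaled_gradient + scaled_reg
}

fn main() {
    let l2 = |w: f64| 0.002 * w; // same shape as NeuraL2(0.002).derivate
    let updated = batched_update(0.5, &[0.1, 0.3, -0.2, 0.2], l2, 0.25);
    println!("updated weight: {updated}");
}
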
@@ -33,6 +33,7 @@ pub(crate) fn multiply_matrix_transpose_vector<const WIDTH: usize, const HEIGHT:
result
}
+// Returns $left^{\top} \cdot right$, ie. $\ket{left} \bra{right}$
pub(crate) fn reverse_dot_product<const WIDTH: usize, const HEIGHT: usize>(
left: &[f64; HEIGHT],
right: &[f64; WIDTH],
@@ -48,6 +49,32 @@ pub(crate) fn reverse_dot_product<const WIDTH: usize, const HEIGHT: usize>(
result
}
+pub(crate) fn multiply_vectors_pointwise<const LENGTH: usize>(
+left: &[f64; LENGTH],
+right: &[f64; LENGTH],
+) -> [f64; LENGTH] {
+let mut result = [0.0; LENGTH];
+for i in 0..LENGTH {
+result[i] = left[i] * right[i];
+}
+result
+}
+#[cfg(test)]
+pub(crate) fn matrix_from_diagonal<const LENGTH: usize>(
+vector: &[f64; LENGTH],
+) -> [[f64; LENGTH]; LENGTH] {
+let mut result = [[0.0; LENGTH]; LENGTH];
+for i in 0..LENGTH {
+result[i][i] = vector[i];
+}
+result
+}
#[allow(dead_code)]
pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f64; N]) {
for i in 0..N {
@@ -89,7 +116,10 @@ struct ShuffleCycled<I: Iterator, R: rand::Rng> {
rng: R,
}
-impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item: Clone {
+impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R>
+where
+I::Item: Clone,
+{
type Item = I::Item;
#[inline]
@@ -99,7 +129,7 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
if let Some(next) = self.iter.next() {
// Base iterator is not empty yet
self.buffer.push(next.clone());
-return Some(next)
+return Some(next);
} else if self.buffer.len() > 0 {
if self.index == 0 {
// Shuffle the vector and return the first element, setting the index to 1
@@ -118,12 +148,9 @@ impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item:
}
}
-pub fn cycle_shuffling<I: Iterator>(
-iter: I,
-rng: impl rand::Rng
-) -> impl Iterator<Item=I::Item>
+pub fn cycle_shuffling<I: Iterator>(iter: I, rng: impl rand::Rng) -> impl Iterator<Item = I::Item>
where
-I::Item: Clone
+I::Item: Clone,
{
let size_hint = iter.size_hint();
let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
@@ -132,6 +159,19 @@ where
buffer: Vec::with_capacity(size_hint),
index: 0,
iter,
-rng
+rng,
}
}
+#[cfg(test)]
+pub(crate) fn uniform_vector<const LENGTH: usize>() -> [f64; LENGTH] {
+use rand::Rng;
+let mut res = [0.0; LENGTH];
+let mut rng = rand::thread_rng();
+for i in 0..LENGTH {
+res[i] = rng.gen();
+}
+res
+}
