From 8ac82e20e29ba8b7d95f1e09b097e60b2eec8ea3 Mon Sep 17 00:00:00 2001 From: Adrien Burgun Date: Wed, 12 Apr 2023 17:16:04 +0200 Subject: [PATCH] :sparkles: Working backpropagation :3 --- examples/bivariate.rs | 71 +++++++++++++++ examples/xor.rs | 28 +++--- src/derivable/activation.rs | 71 +++++++++++++++ src/layer/dense.rs | 6 +- src/lib.rs | 7 +- src/train.rs | 174 +++++++++++++++++++++++++++--------- src/utils.rs | 93 +++++++++++++++---- 7 files changed, 367 insertions(+), 83 deletions(-) create mode 100644 examples/bivariate.rs diff --git a/examples/bivariate.rs b/examples/bivariate.rs new file mode 100644 index 0000000..4a7c0b0 --- /dev/null +++ b/examples/bivariate.rs @@ -0,0 +1,71 @@ +#![feature(generic_arg_infer)] + +use std::io::Write; + +use neuramethyst::prelude::*; +use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu}; +use neuramethyst::derivable::loss::Euclidean; + +use rand::Rng; + +fn main() { + let mut network = neura_network![ + neura_layer!("dense", LeakyRelu(0.01), 4, 2), + neura_layer!("dense", Tanh, 3), + neura_layer!("dense", Relu, 2) + ]; + + let mut rng = rand::thread_rng(); + let inputs = (0..=1).cycle().map(move |category| { + let (x, y) = if category == 0 { + let radius: f64 = rng.gen_range(0.0..1.0); + let radius = radius.sqrt(); + let angle = rng.gen_range(0.0..std::f64::consts::TAU); + (angle.cos() * radius, angle.sin() * radius) + } else { + let radius: f64 = rng.gen_range(1.0..2.0); + let angle = rng.gen_range(0.0..std::f64::consts::TAU); + (angle.cos() * radius, angle.sin() * radius) + }; + + ([x, y], one_hot::<2>(category)) + }); + + let test_inputs: Vec<_> = inputs.clone().take(100).collect(); + + let mut trainer = NeuraBatchedTrainer::new(0.1, 4000); + trainer.log_epochs = 500; + + trainer.train( + NeuraBackprop::new(Euclidean), + &mut network, + inputs, + &test_inputs + ); + + let mut file = std::fs::File::create("target/bivariate.csv").unwrap(); + for (input, _target) in test_inputs { + let guess = argmax(&network.eval(&input)); + writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap(); + } +} + +fn one_hot(value: usize) -> [f64; N] { + let mut res = [0.0; N]; + if value < N { + res[value] = 1.0; + } + res +} + +fn argmax(array: &[f64]) -> usize { + let mut res = 0; + + for n in 1..array.len() { + if array[n] > array[res] { + res = n; + } + } + + res +} diff --git a/examples/xor.rs b/examples/xor.rs index 9d19aa0..fa1a88b 100644 --- a/examples/xor.rs +++ b/examples/xor.rs @@ -1,13 +1,13 @@ #![feature(generic_arg_infer)] use neuramethyst::prelude::*; -use neuramethyst::derivable::activation::{Relu, Tanh}; +use neuramethyst::derivable::activation::{Relu}; use neuramethyst::derivable::loss::Euclidean; fn main() { let mut network = neura_network![ - neura_layer!("dense", Tanh, 2, 2), - neura_layer!("dense", Tanh, 3), + neura_layer!("dense", Relu, 4, 2), + neura_layer!("dense", Relu, 3), neura_layer!("dense", Relu, 1) ]; @@ -18,25 +18,23 @@ fn main() { ([1.0, 1.0], [0.0]) ]; - // println!("{:#?}", network); - for (input, target) in inputs { - println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]); + println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]); } - train_batched( + let mut trainer = NeuraBatchedTrainer::new(0.05, 1000); + trainer.batch_size = 6; + trainer.log_epochs = 250; + trainer.learning_momentum = 0.01; + + trainer.train( + NeuraBackprop::new(Euclidean), &mut network, - inputs.clone(), + cycle_shuffling(inputs.iter().cloned(), 
rand::thread_rng()), &inputs, - NeuraBackprop::new(Euclidean), - 0.01, - 1, - 25 ); - // println!("{:#?}", network); - for (input, target) in inputs { - println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]); + println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]); } } diff --git a/src/derivable/activation.rs b/src/derivable/activation.rs index 0bac5ee..9c13bd3 100644 --- a/src/derivable/activation.rs +++ b/src/derivable/activation.rs @@ -35,6 +35,50 @@ impl NeuraDerivable for Relu { } } +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct LeakyRelu(pub f64); + + +impl NeuraDerivable for LeakyRelu { + #[inline(always)] + fn eval(&self, input: f64) -> f64 { + if input > 0.0 { + input + } else { + self.0 * input + } + } + + #[inline(always)] + fn derivate(&self, input: f64) -> f64 { + if input > 0.0 { + 1.0 + } else { + self.0 + } + } +} + +impl NeuraDerivable for LeakyRelu { + #[inline(always)] + fn eval(&self, input: f32) -> f32 { + if input > 0.0 { + input + } else { + (self.0 as f32) * input + } + } + + #[inline(always)] + fn derivate(&self, input: f32) -> f32 { + if input > 0.0 { + 1.0 + } else { + self.0 as f32 + } + } +} + #[derive(Clone, Copy, Debug, PartialEq)] pub struct Tanh; @@ -63,3 +107,30 @@ impl NeuraDerivable for Tanh { 0.5 * (1.0 - tanh * tanh) } } + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Linear; + +impl NeuraDerivable for Linear { + #[inline(always)] + fn eval(&self, input: f64) -> f64 { + input + } + + #[inline(always)] + fn derivate(&self, _at: f64) -> f64 { + 1.0 + } +} + +impl NeuraDerivable for Linear { + #[inline(always)] + fn eval(&self, input: f32) -> f32 { + input + } + + #[inline(always)] + fn derivate(&self, _at: f32) -> f32 { + 1.0 + } +} diff --git a/src/layer/dense.rs b/src/layer/dense.rs index 2929f22..1776bc8 100644 --- a/src/layer/dense.rs +++ b/src/layer/dense.rs @@ -35,7 +35,7 @@ impl, const INPUT_LEN: usize, const OUTPUT_LEN: usize> for i in 0..OUTPUT_LEN { for j in 0..INPUT_LEN { - weights[i][j] = rng.gen_range(-multiplier..multiplier); + weights[i][j] = rng.gen_range(0.0..multiplier); } } @@ -74,10 +74,10 @@ impl, const INPUT_LEN: usize, const OUTPUT_LEN: usize> // TODO: double-check the math in this fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) { let evaluated = multiply_matrix_vector(&self.weights, input); - // Compute delta from epsilon, with `self.activation'(z) * epsilon = delta` + // Compute delta from epsilon, with `self.activation'(input) ° epsilon = delta` let mut delta = epsilon.clone(); for i in 0..OUTPUT_LEN { - delta[i] = self.activation.derivate(evaluated[i]); + delta[i] *= self.activation.derivate(evaluated[i]); } let weights_gradient = reverse_dot_product(&delta, input); diff --git a/src/lib.rs b/src/lib.rs index d17f734..eb9b2e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,8 @@ pub mod prelude { pub use crate::{neura_network, neura_layer}; // Structs and traits - pub use super::network::{NeuraNetwork}; - pub use super::layer::{NeuraLayer, NeuraDenseLayer}; - pub use super::train::{NeuraBackprop, train_batched}; + pub use crate::network::{NeuraNetwork}; + pub use crate::layer::{NeuraLayer, NeuraDenseLayer}; + pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer}; + pub use crate::utils::cycle_shuffling; } diff --git a/src/train.rs b/src/train.rs index adc23fb..13074b4 100644 --- a/src/train.rs +++ b/src/train.rs @@ -3,7 +3,7 @@ use crate::{ algebra::NeuraVectorSpace, 
derivable::NeuraLoss, layer::NeuraLayer, - network::NeuraNetwork, + network::NeuraNetwork, utils::cycle_shuffling, }; pub trait NeuraTrainableLayer: NeuraLayer { @@ -44,7 +44,7 @@ pub trait NeuraTrainable: NeuraLayer { ) -> (Self::Input, Self::Delta); } -pub trait NeuraTrainer { +pub trait NeuraGradientSolver { fn get_gradient( &self, trainable: &NeuraNetwork, @@ -75,7 +75,7 @@ impl NeuraBackprop { } } -impl + Clone> NeuraTrainer<[f64; N], Loss::Target> +impl + Clone> NeuraGradientSolver<[f64; N], Loss::Target> for NeuraBackprop { fn get_gradient( @@ -103,49 +103,137 @@ impl + Clone> NeuraTrainer<[f6 } } -pub fn train_batched< - Output, - Target, - Trainer: NeuraTrainer, - Layer: NeuraLayer, - ChildNetwork, - Inputs: IntoIterator, ->( - network: &mut NeuraNetwork, - inputs: Inputs, - test_inputs: &[(Layer::Input, Target)], - trainer: Trainer, - learning_rate: f64, - batch_size: usize, - epochs: usize, -) where - NeuraNetwork: NeuraTrainable, - Inputs::IntoIter: Clone, -{ - // TODO: apply shuffling? - let mut iter = inputs.into_iter().cycle(); - let factor = -learning_rate / (batch_size as f64); - - 'd: for epoch in 0..epochs { - let mut gradient_sum = as NeuraTrainable>::Delta::zero(); - - for _ in 0..batch_size { - if let Some((input, target)) = iter.next() { - let gradient = trainer.get_gradient(&network, &input, &target); - gradient_sum.add_assign(&gradient); - } else { - break 'd; - } +#[non_exhaustive] +pub struct NeuraBatchedTrainer { + /// The learning rate of the gradient descent algorithm; the weights `W` will be updated as follows: + /// `W += -learning_rate * gradient_average`. + /// + /// Defaults to `0.1` + pub learning_rate: f64, + + /// The momentum of the gradient descent algorithm; if set to a non-zero value, then the weights `W` will be updated as follows: + /// `W += -learning_rate * gradient_average - learning_momentum * previous_gradient`. + /// This value should be smaller than `learning_rate`. + /// + /// Defaults to `0.0` + pub learning_momentum: f64, + + /// How many gradient computations to average before updating the weights + pub batch_size: usize, + + /// How many batches to run for; if `epochs * batch_size` exceeds the input length, then training will stop. + /// You should use `cycle_shuffling` from the `prelude` module to avoid this. + pub epochs: usize, + + /// The trainer will log progress at every multiple of `log_epochs` steps. + /// If `log_epochs` is zero (default), then no progress will be logged. + /// + /// The test inputs is used to measure the score of the network. + pub log_epochs: usize, +} + +impl Default for NeuraBatchedTrainer { + fn default() -> Self { + Self { + learning_rate: 0.1, + learning_momentum: 0.0, + batch_size: 100, + epochs: 100, + log_epochs: 0, } + } +} - gradient_sum.mul_assign(factor); - network.apply_gradient(&gradient_sum); +impl NeuraBatchedTrainer { + pub fn new(learning_rate: f64, epochs: usize) -> Self { + Self { + learning_rate, + epochs, + ..Default::default() + } + } + + pub fn train< + Output, + Target: Clone, + GradientSolver: NeuraGradientSolver, + Layer: NeuraLayer, + ChildNetwork, + Inputs: IntoIterator, + >( + &self, + gradient_solver: GradientSolver, + network: &mut NeuraNetwork, + inputs: Inputs, + test_inputs: &[(Layer::Input, Target)], + ) where + NeuraNetwork: NeuraTrainable, + Layer::Input: Clone, + { + // TODO: apply shuffling? 
+ let mut iter = inputs.into_iter(); + let factor = -self.learning_rate / (self.batch_size as f64); + let momentum_factor = self.learning_momentum / self.learning_rate; + + // Contains `momentum_factor * factor * gradient_sum_previous_iter` + let mut previous_gradient_sum = as NeuraTrainable>::Delta::zero(); + 'd: for epoch in 0..self.epochs { + let mut gradient_sum = as NeuraTrainable>::Delta::zero(); + + for _ in 0..self.batch_size { + if let Some((input, target)) = iter.next() { + let gradient = gradient_solver.get_gradient(&network, &input, &target); + gradient_sum.add_assign(&gradient); + } else { + break 'd; + } + } + + gradient_sum.mul_assign(factor); + network.apply_gradient(&gradient_sum); + + if self.learning_momentum != 0.0 { + network.apply_gradient(&previous_gradient_sum); + previous_gradient_sum = gradient_sum; + previous_gradient_sum.mul_assign(momentum_factor); + } - let mut loss_sum = 0.0; - for (input, target) in test_inputs { - loss_sum += trainer.score(&network, input, target); + if self.log_epochs > 0 && (epoch + 1) % self.log_epochs == 0 { + let mut loss_sum = 0.0; + for (input, target) in test_inputs { + loss_sum += gradient_solver.score(&network, input, target); + } + loss_sum /= test_inputs.len() as f64; + println!("Epoch {}, Loss: {:.3}", epoch + 1, loss_sum); + } + } + } +} + +#[cfg(test)] +mod test { + use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}}; + use super::*; + + #[test] + fn test_backpropagation_simple() { + for wa in [0.0, 0.25, 0.5, 1.0] { + for wb in [0.0, 0.25, 0.5, 1.0] { + let network = NeuraNetwork::new( + NeuraDenseLayer::new([[wa, wb]], [0.0], Linear), + () + ); + + let gradient = NeuraBackprop::new(Euclidean).get_gradient( + &network, + &[1.0, 1.0], + &[0.0] + ); + + let expected = wa + wb; + assert!((gradient.0[0][0] - expected) < 0.001); + assert!((gradient.0[0][1] - expected) < 0.001); + } } - loss_sum /= test_inputs.len() as f64; - println!("Epoch {epoch}, Loss: {:.3}", loss_sum); } } diff --git a/src/utils.rs b/src/utils.rs index 7b63642..4f8e535 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -54,36 +54,91 @@ pub(crate) fn assign_add_vector(sum: &mut [f64; N], operand: &[f } } +struct Chunked { + iter: J, + chunk_size: usize, +} + +impl Iterator for Chunked { + type Item = Vec; + + fn next(&mut self) -> Option { + let mut result = Vec::with_capacity(self.chunk_size); + + for _ in 0..self.chunk_size { + if let Some(item) = self.iter.next() { + result.push(item); + } else { + break; + } + } + + if result.len() > 0 { + Some(result) + } else { + None + } + } +} + pub(crate) fn chunked( iter: I, chunk_size: usize, ) -> impl Iterator> { - struct Chunked { - iter: J, - chunk_size: usize, - } + Chunked { iter, chunk_size } +} - impl Iterator for Chunked { - type Item = Vec; - fn next(&mut self) -> Option { - let mut result = Vec::with_capacity(self.chunk_size); +struct ShuffleCycled { + buffer: Vec, + index: usize, + iter: I, + rng: R, +} - for _ in 0..self.chunk_size { - if let Some(item) = self.iter.next() { - result.push(item); - } else { - break; - } - } +impl Iterator for ShuffleCycled where I::Item: Clone { + type Item = I::Item; - if result.len() > 0 { - Some(result) + #[inline] + fn next(&mut self) -> Option { + use rand::prelude::SliceRandom; + + if let Some(next) = self.iter.next() { + // Base iterator is not empty yet + self.buffer.push(next.clone()); + return Some(next) + } else if self.buffer.len() > 0 { + if self.index == 0 { + // Shuffle the vector and return the first element, setting the 
index to 1
+                self.buffer.shuffle(&mut self.rng);
+                self.index = 1;
+                Some(self.buffer[0].clone())
             } else {
-                None
+                // Keep consuming the shuffled vector
+                let res = self.buffer[self.index].clone();
+                self.index = (self.index + 1) % self.buffer.len();
+                Some(res)
             }
+        } else {
+            None
         }
     }
+}
 
-    Chunked { iter, chunk_size }
+pub fn cycle_shuffling<I: Iterator>(
+    iter: I,
+    rng: impl rand::Rng
+) -> impl Iterator<Item = I::Item>
+where
+    I::Item: Clone
+{
+    let size_hint = iter.size_hint();
+    let size_hint = size_hint.1.unwrap_or(size_hint.0).max(1);
+
+    ShuffleCycled {
+        buffer: Vec::with_capacity(size_hint),
+        index: 0,
+        iter,
+        rng
+    }
 }
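
Illustration (not part of the patch): the per-batch weight update that `NeuraBatchedTrainer` documents on its `learning_rate` and `learning_momentum` fields is `W += -learning_rate * gradient_average - learning_momentum * previous_gradient`. The standalone sketch below applies that same rule to a single scalar weight minimising 0.5 * (w - target)^2, so the effect of the two constants can be checked in isolation; the quadratic objective and the numeric values are made up for the example and no neuramethyst types are used.

fn main() {
    let learning_rate = 0.1;
    let learning_momentum = 0.01;
    let target = 3.0;

    let mut w: f64 = 0.0;
    let mut previous_gradient = 0.0;

    for epoch in 0..100 {
        // Gradient of the toy loss 0.5 * (w - target)^2, standing in for the
        // batch-averaged gradient computed by the trainer.
        let gradient_average = w - target;

        // W += -learning_rate * gradient_average - learning_momentum * previous_gradient
        w += -learning_rate * gradient_average - learning_momentum * previous_gradient;
        previous_gradient = gradient_average;

        if (epoch + 1) % 25 == 0 {
            println!("Epoch {}, w = {:.3}", epoch + 1, w);
        }
    }
}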
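
Illustration (not part of the patch): the new `test_backpropagation_simple` expects the weight gradient of a 1x2 linear layer under the Euclidean loss to be `wa + wb` for input [1.0, 1.0] and target [0.0]. The plain-Rust sketch below re-derives that value by finite differences, assuming the loss is 0.5 * ||y - target||^2 (the scaling implied by the expected value in the test); again, no neuramethyst types are involved.

fn main() {
    // Toy layer: y = wa * x0 + wb * x1, linear activation, zero bias.
    let loss = |wa: f64, wb: f64| {
        let y = wa * 1.0 + wb * 1.0; // input [1.0, 1.0]
        0.5 * (y - 0.0_f64).powi(2) // target [0.0]
    };

    for (wa, wb) in [(0.25, 0.5), (1.0, 1.0)] {
        let h = 1e-6;
        // Finite-difference estimate of dL/dwa.
        let grad_wa = (loss(wa + h, wb) - loss(wa - h, wb)) / (2.0 * h);
        // Analytic value the test asserts: (y - target) * x0 = wa + wb.
        println!("dL/dwa ~ {:.4}, expected {:.4}", grad_wa, wa + wb);
    }
}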