✨ Working backpropagation :3

2 years ago · 8ac82e20e2
parent 7a6921a1c1
commit 8ac82e20e2
7 changed files with 367 additions and 83 deletions
--- a/examples/bivariate.rs
+++ b/examples/bivariate.rs
@ -0,0 +1,71 @@
 #![feature(generic_arg_infer)]
 use std::io::Write;
 use neuramethyst::prelude::*;
 use neuramethyst::derivable::activation::{Relu, Tanh, LeakyRelu};
 use neuramethyst::derivable::loss::Euclidean;
 use rand::Rng;
 /// Trains a small three-layer dense network to tell apart two point clouds
 /// (an inner disk and a surrounding ring), then writes the network's guesses
 /// for the test points to `target/bivariate.csv`.
 ///
 /// Panics if the CSV file cannot be created or written to (`unwrap`).
 fn main() {
    // Three dense layers using LeakyRelu(0.01), Tanh and Relu activations.
    // NOTE(review): the numeric macro arguments are presumably layer sizes
    // (2 inputs -> 4 -> 3 -> 2 outputs) — confirm against `neura_layer!`.
    let mut network = neura_network![
        neura_layer!("dense", LeakyRelu(0.01), 4, 2),
        neura_layer!("dense", Tanh, 3),
        neura_layer!("dense", Relu, 2)
    ];
    let mut rng = rand::thread_rng();
    // Endless sample generator alternating between category 0 and category 1.
    let inputs = (0..=1).cycle().map(move |category| {
        let (x, y) = if category == 0 {
            // Category 0: a point inside the unit disk. Taking the square root
            // of a uniform radius spreads the points evenly over the disk's area.
            let radius: f64 = rng.gen_range(0.0..1.0);
            let radius = radius.sqrt();
            let angle = rng.gen_range(0.0..std::f64::consts::TAU);
            (angle.cos() * radius, angle.sin() * radius)
        } else {
            // Category 1: a point in the ring with radius in [1.0, 2.0).
            let radius: f64 = rng.gen_range(1.0..2.0);
            let angle = rng.gen_range(0.0..std::f64::consts::TAU);
            (angle.cos() * radius, angle.sin() * radius)
        };
        // The training target is the one-hot encoding of the category.
        ([x, y], one_hot::<2>(category))
    });
    // Draw 100 samples from a clone of the generator to use as the test set.
    let test_inputs: Vec<_> = inputs.clone().take(100).collect();
    // Learning rate 0.1, 4000 epochs; log progress every 500 epochs.
    let mut trainer = NeuraBatchedTrainer::new(0.1, 4000);
    trainer.log_epochs = 500;
    trainer.train(
        NeuraBackprop::new(Euclidean),
        &mut network,
        inputs,
        &test_inputs
    );
    // Dump one `x,y,guess` CSV row per test point, where `guess` is the index
    // of the largest network output.
    let mut file = std::fs::File::create("target/bivariate.csv").unwrap();
    for (input, _target) in test_inputs {
        let guess = argmax(&network.eval(&input));
        writeln!(&mut file, "{},{},{}", input[0], input[1], guess).unwrap();
    }
 }
 /// Builds a one-hot vector of length `N`: every entry is `0.0` except the one
 /// at index `value`, which is `1.0`.
 ///
 /// If `value` is out of range (`value >= N`), the result is all zeros.
 fn one_hot<const N: usize>(value: usize) -> [f64; N] {
    let mut encoded = [0.0; N];
    // `get_mut` yields `None` for an out-of-range index, leaving the array zeroed.
    if let Some(slot) = encoded.get_mut(value) {
        *slot = 1.0;
    }
    encoded
 }
 /// Returns the index of the largest element of `array`.
 ///
 /// Ties resolve to the earliest index (a later element must be strictly
 /// greater to win). An empty slice yields `0`.
 fn argmax(array: &[f64]) -> usize {
    array
        .iter()
        .enumerate()
        .skip(1)
        .fold(0, |best, (index, &value)| {
            if value > array[best] {
                index
            } else {
                best
            }
        })
 }
--- a/examples/xor.rs
+++ b/examples/xor.rs
@ -1,13 +1,13 @@
 #![feature(generic_arg_infer)]
 use neuramethyst::prelude::*;
-use neuramethyst::derivable::activation::{Relu, Tanh};
+use neuramethyst::derivable::activation::{Relu};
 use neuramethyst::derivable::loss::Euclidean;
 fn main() {
    let mut network = neura_network![
-        neura_layer!("dense", Tanh, 2, 2),
+        neura_layer!("dense", Relu, 4, 2),
-        neura_layer!("dense", Tanh, 3),
+        neura_layer!("dense", Relu, 3),
        neura_layer!("dense", Relu, 1)
    ];
@ -18,25 +18,23 @@ fn main() {
        ([1.0, 1.0], [0.0])
    ];
    // println!("{:#?}", network);
    for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]);
+        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
    }
-    train_batched(
+    let mut trainer = NeuraBatchedTrainer::new(0.05, 1000);
    trainer.batch_size = 6;
    trainer.log_epochs = 250;
    trainer.learning_momentum = 0.01;
    trainer.train(
        NeuraBackprop::new(Euclidean),
        &mut network,
-        inputs.clone(),
+        cycle_shuffling(inputs.iter().cloned(), rand::thread_rng()),
        &inputs,
        NeuraBackprop::new(Euclidean),
        0.01,
        1,
        25
    );
    // println!("{:#?}", network);
    for (input, target) in inputs {
-        println!("Input: {:?}, target: {}, actual: {}", &input, target[0], network.eval(&input)[0]);
+        println!("Input: {:?}, target: {}, actual: {:.3}", &input, target[0], network.eval(&input)[0]);
    }
 }
--- a/src/derivable/activation.rs
+++ b/src/derivable/activation.rs
@ -35,6 +35,50 @@ impl NeuraDerivable<f32> for Relu {
    }
 }
 /// Leaky rectified linear unit: the identity for strictly positive inputs and
 /// a line of slope `self.0` for every other input.
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub struct LeakyRelu(pub f64);
 impl NeuraDerivable<f64> for LeakyRelu {
    /// Returns `input` when it is strictly positive, `self.0 * input` otherwise.
    #[inline(always)]
    fn eval(&self, input: f64) -> f64 {
        match input {
            positive if positive > 0.0 => positive,
            other => self.0 * other,
        }
    }
    /// Returns the slope at `input`: `1.0` when strictly positive, `self.0` otherwise.
    #[inline(always)]
    fn derivate(&self, input: f64) -> f64 {
        match input {
            positive if positive > 0.0 => 1.0,
            _ => self.0,
        }
    }
 }
 impl NeuraDerivable<f32> for LeakyRelu {
    /// Single-precision variant of `eval`; the leak slope is narrowed to `f32`.
    #[inline(always)]
    fn eval(&self, input: f32) -> f32 {
        match input {
            positive if positive > 0.0 => positive,
            other => (self.0 as f32) * other,
        }
    }
    /// Single-precision variant of `derivate`; the leak slope is narrowed to `f32`.
    #[inline(always)]
    fn derivate(&self, input: f32) -> f32 {
        match input {
            positive if positive > 0.0 => 1.0,
            _ => self.0 as f32,
        }
    }
 }
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub struct Tanh;
@ -63,3 +107,30 @@ impl NeuraDerivable<f32> for Tanh {
        0.5 * (1.0 - tanh * tanh)
    }
 }
 /// Identity activation: `eval` returns its input unchanged and the derivative
 /// is `1.0` everywhere.
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub struct Linear;
 impl NeuraDerivable<f64> for Linear {
    /// Returns `value` unchanged.
    #[inline(always)]
    fn eval(&self, value: f64) -> f64 {
        value
    }
    /// The slope of the identity function is constant, so the argument is ignored.
    #[inline(always)]
    fn derivate(&self, _value: f64) -> f64 {
        1.0
    }
 }
 impl NeuraDerivable<f32> for Linear {
    /// Returns `value` unchanged.
    #[inline(always)]
    fn eval(&self, value: f32) -> f32 {
        value
    }
    /// The slope of the identity function is constant, so the argument is ignored.
    #[inline(always)]
    fn derivate(&self, _value: f32) -> f32 {
        1.0
    }
 }
--- a/src/layer/dense.rs
+++ b/src/layer/dense.rs
@ -35,7 +35,7 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
        for i in 0..OUTPUT_LEN {
            for j in 0..INPUT_LEN {
-                weights[i][j] = rng.gen_range(-multiplier..multiplier);
+                weights[i][j] = rng.gen_range(0.0..multiplier);
            }
        }
@ -74,10 +74,10 @@ impl<Act: NeuraDerivable<f64>, const INPUT_LEN: usize, const OUTPUT_LEN: usize>
    // TODO: double-check the math in this
    fn backpropagate(&self, input: &Self::Input, epsilon: Self::Output) -> (Self::Input, Self::Delta) {
        let evaluated = multiply_matrix_vector(&self.weights, input);
-        // Compute delta from epsilon, with `self.activation'(z) * epsilon = delta`
+        // Compute delta from epsilon, with `self.activation'(evaluated) ° epsilon = delta` (element-wise product)
        let mut delta = epsilon.clone();
        for i in 0..OUTPUT_LEN {
-            delta[i] = self.activation.derivate(evaluated[i]);
+            delta[i] *= self.activation.derivate(evaluated[i]);
        }
        let weights_gradient = reverse_dot_product(&delta, input);
--- a/src/lib.rs
+++ b/src/lib.rs
@ -13,7 +13,8 @@ pub mod prelude {
    pub use crate::{neura_network, neura_layer};
    // Structs and traits
-    pub use super::network::{NeuraNetwork};
+    pub use crate::network::{NeuraNetwork};
-    pub use super::layer::{NeuraLayer, NeuraDenseLayer};
+    pub use crate::layer::{NeuraLayer, NeuraDenseLayer};
-    pub use super::train::{NeuraBackprop, train_batched};
+    pub use crate::train::{NeuraBackprop, NeuraBatchedTrainer};
    pub use crate::utils::cycle_shuffling;
 }
--- a/src/train.rs
+++ b/src/train.rs
@ -3,7 +3,7 @@ use crate::{
    algebra::NeuraVectorSpace,
    derivable::NeuraLoss,
    layer::NeuraLayer,
-    network::NeuraNetwork,
+    network::NeuraNetwork, utils::cycle_shuffling,
 };
 pub trait NeuraTrainableLayer: NeuraLayer {
@ -44,7 +44,7 @@ pub trait NeuraTrainable: NeuraLayer {
    ) -> (Self::Input, Self::Delta);
 }
-pub trait NeuraTrainer<Output, Target = Output> {
+pub trait NeuraGradientSolver<Output, Target = Output> {
    fn get_gradient<Layer: NeuraLayer, ChildNetwork>(
        &self,
        trainable: &NeuraNetwork<Layer, ChildNetwork>,
@ -75,7 +75,7 @@ impl<Loss: NeuraLoss + Clone> NeuraBackprop<Loss> {
    }
 }
-impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraTrainer<[f64; N], Loss::Target>
+impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraGradientSolver<[f64; N], Loss::Target>
    for NeuraBackprop<Loss>
 {
    fn get_gradient<Layer: NeuraLayer, ChildNetwork>(
@ -103,35 +103,86 @@ impl<const N: usize, Loss: NeuraLoss<Input = [f64; N]> + Clone> NeuraTrainer<[f6
    }
 }
-pub fn train_batched<
+#[non_exhaustive]
 /// Configuration for mini-batch gradient descent training: gradients are
 /// summed over `batch_size` samples, scaled, and applied once per epoch.
 pub struct NeuraBatchedTrainer {
    /// The learning rate of the gradient descent algorithm; the weights `W` will be updated as follows:
    /// `W += -learning_rate * gradient_average`.
    ///
    /// Defaults to `0.1`
    pub learning_rate: f64,
    /// The momentum of the gradient descent algorithm; if set to a non-zero value, then the weights `W` will be updated as follows:
    /// `W += -learning_rate * gradient_average - learning_momentum * previous_gradient`.
    /// This value should be smaller than `learning_rate`.
    ///
    /// Defaults to `0.0`
    pub learning_momentum: f64,
    /// How many gradient computations to average before updating the weights.
    ///
    /// Defaults to `100`
    pub batch_size: usize,
    /// How many batches to run for; training stops early if the input iterator runs out
    /// before `epochs * batch_size` samples have been consumed.
    /// You should use `cycle_shuffling` from the `prelude` module to avoid this.
    ///
    /// Defaults to `100`
    pub epochs: usize,
    /// The trainer will log progress at every multiple of `log_epochs` steps.
    /// If `log_epochs` is zero (default), then no progress will be logged.
    ///
    /// The test inputs are used to measure the score of the network.
    pub log_epochs: usize,
 }
 impl Default for NeuraBatchedTrainer {
    fn default() -> Self {
        Self {
            learning_rate: 0.1,
            learning_momentum: 0.0,
            batch_size: 100,
            epochs: 100,
            log_epochs: 0,
        }
    }
 }
 impl NeuraBatchedTrainer {
    pub fn new(learning_rate: f64, epochs: usize) -> Self {
        Self {
            learning_rate,
            epochs,
            ..Default::default()
        }
    }
    pub fn train<
        Output,
-    Target,
+        Target: Clone,
-    Trainer: NeuraTrainer<Output, Target>,
+        GradientSolver: NeuraGradientSolver<Output, Target>,
        Layer: NeuraLayer,
        ChildNetwork,
        Inputs: IntoIterator<Item = (Layer::Input, Target)>,
->(
+    >(
        &self,
        gradient_solver: GradientSolver,
        network: &mut NeuraNetwork<Layer, ChildNetwork>,
        inputs: Inputs,
        test_inputs: &[(Layer::Input, Target)],
-    trainer: Trainer,
+    ) where
    learning_rate: f64,
    batch_size: usize,
    epochs: usize,
 ) where
        NeuraNetwork<Layer, ChildNetwork>: NeuraTrainable<Input = Layer::Input, Output = Output>,
-    Inputs::IntoIter: Clone,
+        Layer::Input: Clone,
-{
+    {
        // TODO: apply shuffling?
-    let mut iter = inputs.into_iter().cycle();
+        let mut iter = inputs.into_iter();
-    let factor = -learning_rate / (batch_size as f64);
+        let factor = -self.learning_rate / (self.batch_size as f64);
        let momentum_factor = self.learning_momentum / self.learning_rate;
-    'd: for epoch in 0..epochs {
+        // Contains `momentum_factor * factor * gradient_sum_previous_iter`
        let mut previous_gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
        'd: for epoch in 0..self.epochs {
            let mut gradient_sum = <NeuraNetwork<Layer, ChildNetwork> as NeuraTrainable>::Delta::zero();
-        for _ in 0..batch_size {
+            for _ in 0..self.batch_size {
                if let Some((input, target)) = iter.next() {
-                let gradient = trainer.get_gradient(&network, &input, &target);
+                    let gradient = gradient_solver.get_gradient(&network, &input, &target);
                    gradient_sum.add_assign(&gradient);
                } else {
                    break 'd;
@ -141,11 +192,48 @@ pub fn train_batched<
            gradient_sum.mul_assign(factor);
            network.apply_gradient(&gradient_sum);
            if self.learning_momentum != 0.0 {
                network.apply_gradient(&previous_gradient_sum);
                previous_gradient_sum = gradient_sum;
                previous_gradient_sum.mul_assign(momentum_factor);
            }
            if self.log_epochs > 0 && (epoch + 1) % self.log_epochs == 0 {
                let mut loss_sum = 0.0;
                for (input, target) in test_inputs {
-            loss_sum += trainer.score(&network, input, target);
+                    loss_sum += gradient_solver.score(&network, input, target);
                }
                loss_sum /= test_inputs.len() as f64;
-        println!("Epoch {epoch}, Loss: {:.3}", loss_sum);
+                println!("Epoch {}, Loss: {:.3}", epoch + 1, loss_sum);
            }
        }
    }
 }
 #[cfg(test)]
 mod test {
    use crate::{layer::NeuraDenseLayer, derivable::{activation::Linear, loss::Euclidean}};
    use super::*;
    /// Checks the backpropagated weight gradient of a single linear neuron
    /// with two weights `wa`, `wb` and a zero bias.
    ///
    /// With input `[1.0, 1.0]` and target `[0.0]` the neuron outputs `wa + wb`.
    /// NOTE(review): the expected gradient `wa + wb` assumes the derivative of
    /// the `Euclidean` loss is `output - target` — confirm against its definition.
    #[test]
    fn test_backpropagation_simple() {
        for wa in [0.0, 0.25, 0.5, 1.0] {
            for wb in [0.0, 0.25, 0.5, 1.0] {
                let network = NeuraNetwork::new(
                    NeuraDenseLayer::new([[wa, wb]], [0.0], Linear),
                    ()
                );
                let gradient = NeuraBackprop::new(Euclidean).get_gradient(
                    &network,
                    &[1.0, 1.0],
                    &[0.0]
                );
                let expected = wa + wb;
                // Compare the absolute error: without `.abs()` any gradient that is
                // too small (or negative) would silently satisfy the assertion.
                assert!(
                    (gradient.0[0][0] - expected).abs() < 0.001,
                    "gradient for wa is {}, expected {}",
                    gradient.0[0][0],
                    expected
                );
                assert!(
                    (gradient.0[0][1] - expected).abs() < 0.001,
                    "gradient for wb is {}, expected {}",
                    gradient.0[0][1],
                    expected
                );
            }
        }
    }
 }
--- a/src/utils.rs
+++ b/src/utils.rs
@ -54,16 +54,12 @@ pub(crate) fn assign_add_vector<const N: usize>(sum: &mut [f64; N], operand: &[f
    }
 }
-pub(crate) fn chunked<I: Iterator>(
+struct Chunked<J: Iterator> {
    iter: I,
    chunk_size: usize,
 ) -> impl Iterator<Item = Vec<I::Item>> {
    struct Chunked<J: Iterator> {
    iter: J,
    chunk_size: usize,
-    }
+}
-    impl<J: Iterator> Iterator for Chunked<J> {
+impl<J: Iterator> Iterator for Chunked<J> {
    type Item = Vec<J::Item>;
    fn next(&mut self) -> Option<Self::Item> {
@ -83,7 +79,66 @@ pub(crate) fn chunked<I: Iterator>(
            None
        }
    }
-    }
+}
 /// Wraps `iter` in a `Chunked` adapter, which yields the items of `iter`
 /// grouped into `Vec`s.
 ///
 /// NOTE(review): presumably each yielded vector holds up to `chunk_size`
 /// items — confirm against `Chunked::next`.
 pub(crate) fn chunked<I: Iterator>(
    iter: I,
    chunk_size: usize,
 ) -> impl Iterator<Item = Vec<I::Item>> {
    Chunked { iter, chunk_size }
 }
 /// Iterator adapter state used by `cycle_shuffling`: it records every item
 /// produced by `iter` and, once `iter` is exhausted, replays the recorded
 /// items in shuffled order.
 struct ShuffleCycled<I: Iterator, R: rand::Rng> {
    /// Clones of every item yielded by `iter` so far; shuffled in place when replayed.
    buffer: Vec<I::Item>,
    /// Position in `buffer` of the next item to replay; `0` triggers a reshuffle.
    index: usize,
    /// The underlying iterator, drained before any replay happens.
    iter: I,
    /// Source of randomness used to shuffle `buffer`.
    rng: R,
 }
 impl<I: Iterator, R: rand::Rng> Iterator for ShuffleCycled<I, R> where I::Item: Clone {
    type Item = I::Item;
    /// Yields the items of the base iterator in order while recording a clone
    /// of each one; once the base iterator is exhausted, replays the recorded
    /// items forever, reshuffling them at the start of every pass.
    ///
    /// Returns `None` only when the base iterator never produced any item.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        use rand::prelude::SliceRandom;
        if let Some(next) = self.iter.next() {
            // Base iterator is not empty yet: record the item for later passes
            self.buffer.push(next.clone());
            return Some(next);
        }
        if self.buffer.is_empty() {
            return None;
        }
        if self.index == 0 {
            // Start of a new pass over the recorded items
            self.buffer.shuffle(&mut self.rng);
        }
        let res = self.buffer[self.index].clone();
        // Wrapping here (rather than unconditionally setting the index to 1 after a
        // shuffle) keeps the index in bounds when the buffer holds a single item.
        self.index = (self.index + 1) % self.buffer.len();
        Some(res)
    }
 }
 /// Turns `iter` into an endless iterator: the items of `iter` are first
 /// yielded in their original order, then replayed indefinitely, reshuffled
 /// with `rng` at the start of each pass.
 ///
 /// The returned iterator only ends if `iter` yields no item at all.
 pub fn cycle_shuffling<I: Iterator>(
    iter: I,
    rng: impl rand::Rng
 ) -> impl Iterator<Item=I::Item>
 where
    I::Item: Clone
 {
    // Pre-size the replay buffer from the iterator's own estimate: the upper
    // bound when it is known, the lower bound otherwise, and at least one slot.
    let (lower, upper) = iter.size_hint();
    let capacity = upper.unwrap_or(lower).max(1);
    let buffer = Vec::with_capacity(capacity);
    ShuffleCycled {
        buffer,
        index: 0,
        iter,
        rng
    }
 }