From bc86802ddc2220a41b30377489a26850c2d44fcc Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Subject: [PATCH] Implementation of fundamental techniques, namely Cross-Validation and Early Stopping
-
- Cross-Validation & Early Stopping
-
- Cross-Validation
-
- Getting data is expensive, and in some cases one has no option but to use a limited amount of data to train a machine learning model.
- This is where Cross-Validation is useful. The steps are as follows:
- 1. Split the training set into k equal folds.
- 2. For each fold, train the model on the remaining k-1 folds and validate it on the held-out fold.
- 3. Average the validation metrics over all k folds; a sketch of the fold splitting is shown below.
-
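As a minimal sketch of the splitting step described above (assuming plain index arithmetic and `torch.utils.data.Subset`; the helper name `make_folds` and the fixed seed are illustrative, not part of this patch):

```python
import torch
from torch.utils.data import Subset

def make_folds(dataset, splits: int, seed: int = 0):
    """Split `dataset` into `splits` folds and yield (train_subset, val_subset) pairs."""
    n = len(dataset)
    # Shuffle indices once with a fixed seed so every fold sees the same permutation.
    indices = torch.randperm(n, generator=torch.Generator().manual_seed(seed)).tolist()
    fold_size = n // splits
    for k in range(splits):
        val_idx = indices[k * fold_size:(k + 1) * fold_size]
        train_idx = indices[:k * fold_size] + indices[(k + 1) * fold_size:]
        yield Subset(dataset, train_idx), Subset(dataset, val_idx)
```

Each yielded pair can be wrapped in `DataLoader`s for one round of training and validation.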
Therefore, the user has to find a tradeoff between bias and variance.
-
- Early stopping is one way to find this tradeoff. It helps to find a good setting of the parameters, prevents overfitting on the dataset, and saves computation time.
- This can be visualized through the following graph of training loss and validation loss over time:
- It can be seen that the training error continues to decrease, but the validation error starts to increase after around 40 epochs.
- Therefore, our goal is to stop training once the validation loss starts to increase; a sketch of such a check follows.
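As a hedged sketch of that stopping rule, a simple patience counter could look like the following (the class name `EarlyStopper` and the `patience`/`min_delta` parameters are assumptions for illustration, not the exact logic used later in this file):

```python
class EarlyStopper:
    """Stop training when the validation loss has not improved for `patience` epochs."""

    def __init__(self, patience: int = 5, min_delta: float = 0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.counter = 0

    def should_stop(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            # Validation loss improved: remember it and reset the counter.
            self.best_loss = val_loss
            self.counter = 0
        else:
            # No improvement: count this epoch against the patience budget.
            self.counter += 1
        return self.counter >= self.patience
```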
-
-
-
+import torch
@@ -128,10 +97,7 @@
- Cross-Validation
- Splitting of the training set into folds can be represented as:
-
-
+
def cross_val_train(cost, trainset, epochs, splits, device=None):
@@ -190,7 +156,7 @@
- Training steps
+ training steps
net.train()  # Enable Dropout
@@ -203,7 +169,6 @@
# Get the inputs; data is a list of [inputs, labels]
-Load the inputs on the GPU if available, else the CPU
if device:
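A minimal sketch of what this device check typically expands to, assuming a helper `to_device` that is not part of the patch:

```python
import torch

def to_device(batch, device=None):
    """Move an (inputs, labels) batch to `device` when one is given, otherwise leave it on the CPU."""
    inputs, labels = batch
    if device:
        inputs, labels = inputs.to(device), labels.to(device)
    return inputs, labels

# Example usage with a hypothetical batch of images and class labels.
batch = (torch.randn(4, 3, 32, 32), torch.randint(0, 10, (4,)))
inputs, labels = to_device(batch, torch.device('cuda') if torch.cuda.is_available() else None)
```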
@@ -242,7 +207,7 @@
- Calculate loss
+ Print loss
running_loss += loss.item()
@@ -258,7 +223,7 @@
- Validation and printing the metrics
+ Validation
loss_accuracy = Test(net, cost, valdata, device)
@@ -294,17 +259,7 @@
- Early stopping
- Early stopping can also be understood graphically, in terms of how the weights change during the course of training.
-
- The early stopping implementation is adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
if losses[epoch] > min_loss:
@@ -358,7 +313,7 @@
- Retrieve the model which has the best accuracy over the validation set
+
def retreive_best_trial():
@@ -412,7 +367,7 @@
- Forward pass
+ forward pass
loss = cost(output, labels)
-Update validation loss
+update validation loss
_, preds = torch.max(output, dim=1)
@@ -502,7 +457,7 @@
- Loss in batch
+ loss in batch
loss += cost(outputs, labels)
@@ -514,7 +469,7 @@
- Calculate loss and accuracy over the validation set
+ losses[epoch] += loss.item()
_, predicted = torch.max(outputs.data, 1)
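For context, here is a hedged sketch of how these pieces are commonly combined into a validation pass (the helper name `evaluate` and the averaging scheme are assumptions, not this file's exact `Test` function):

```python
import torch

@torch.no_grad()
def evaluate(net, cost, loader, device=None):
    """Return (mean batch loss, accuracy) of `net` over a validation DataLoader."""
    net.eval()  # disable Dropout and use BatchNorm running statistics
    total_loss, correct, total = 0.0, 0, 0
    for inputs, labels in loader:
        if device:
            inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        total_loss += cost(outputs, labels).item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    return total_loss / len(loader), correct / total
```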
diff --git a/docs/gan/cycle_gan.html b/docs/gan/cycle_gan/index.html
similarity index 99%
rename from docs/gan/cycle_gan.html
rename to docs/gan/cycle_gan/index.html
index 538c6099..3fb584f1 100644
--- a/docs/gan/cycle_gan.html
+++ b/docs/gan/cycle_gan/index.html
@@ -12,7 +12,7 @@
-
+
@@ -22,8 +22,8 @@
Cycle GAN
-
-
+
+
-
-$p_{data}(\pmb{x})$ is the probability distribution over data,
-whilst $p_{\pmb{z}}(\pmb{z})$ is the probability distribution of $\pmb{z}$, which is set to
-Gaussian noise.
-This file defines the loss functions. Here is an MNIST example
-with two multilayer perceptrons for the generator and discriminator.
-import torch
-import torch.nn as nn
-import torch.utils.data
-import torch.utils.data
-
-from labml_helpers.module import Module
-The discriminator should ascend on the gradient,
-$$\nabla_{\theta_d} \frac{1}{m} \sum_{i=1}^{m} \Big[\log D\big(\pmb{x}^{(i)}\big) + \log \Big(1 - D\big(G(\pmb{z}^{(i)})\big)\Big)\Big]$$
-$m$ is the mini-batch size and $(i)$ is used to index samples in the mini-batch.
-$\pmb{x}$ are samples from $p_{data}$ and $\pmb{z}$ are samples from $p_z$.
-class DiscriminatorLogitsLoss(Module):
-    def __init__(self, smoothing: float = 0.2):
-        super().__init__()
-We use PyTorch Binary Cross Entropy Loss, which is
-$-\sum\Big[y \log(\hat{y}) + (1 - y) \log(1 - \hat{y})\Big]$,
-where $y$ are the labels and $\hat{y}$ are the predictions.
-Note the negative sign.
-We use labels equal to $1$ for $\pmb{x}$ from $p_{data}$
-and labels equal to $0$ for $\pmb{x}$ from $p_{G}$.
-Then descending on the sum of these is the same as ascending on
-the above gradient.
-BCEWithLogitsLoss combines a sigmoid and binary cross entropy loss.
        self.loss_true = nn.BCEWithLogitsLoss()
-        self.loss_false = nn.BCEWithLogitsLoss()
-We use label smoothing because it seems to work better in some cases
-        self.smoothing = smoothing
-Labels are registered as buffers and persistence is set to False.
        self.register_buffer('labels_true', _create_labels(256, 1.0 - smoothing, 1.0), False)
-        self.register_buffer('labels_false', _create_labels(256, 0.0, smoothing), False)
-logits_true are logits from $D(\pmb{x}^{(i)})$ and
-logits_false are logits from $D(G(\pmb{z}^{(i)}))$
    def __call__(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
        if len(logits_true) > len(self.labels_true):
-            self.register_buffer("labels_true",
-                                 _create_labels(len(logits_true), 1.0 - self.smoothing, 1.0, logits_true.device), False)
-        if len(logits_false) > len(self.labels_false):
-            self.register_buffer("labels_false",
-                                 _create_labels(len(logits_false), 0.0, self.smoothing, logits_false.device), False)
-
-        return (self.loss_true(logits_true, self.labels_true[:len(logits_true)]),
-                self.loss_false(logits_false, self.labels_false[:len(logits_false)]))
-class GeneratorLogitsLoss(Module):
-    def __init__(self, smoothing: float = 0.2):
-        super().__init__()
-        self.loss_true = nn.BCEWithLogitsLoss()
-        self.smoothing = smoothing
-We use labels equal to $1$ for $\pmb{x}$ from $p_{G}$.
-Then descending on this loss is the same as descending on
-the above gradient.
-        self.register_buffer('fake_labels', _create_labels(256, 1.0 - smoothing, 1.0), False)
-    def __call__(self, logits: torch.Tensor):
-        if len(logits) > len(self.fake_labels):
-            self.register_buffer("fake_labels",
-                                 _create_labels(len(logits), 1.0 - self.smoothing, 1.0, logits.device), False)
-
-        return self.loss_true(logits, self.fake_labels[:len(logits)])
-Create smoothed labels
-def _create_labels(n: int, r1: float, r2: float, device: torch.device = None):
-    return torch.empty(n, 1, requires_grad=False, device=device).uniform_(r1, r2)
+This is an implementation of
+Generative Adversarial Networks.
+The generator, $G(\pmb{z}; \theta_g)$, generates samples that match the
+distribution of data, while the discriminator, $D(\pmb{x}; \theta_d)$,
+gives the probability that $\pmb{x}$ came from data rather than $G$.
+We train $D$ and $G$ simultaneously on a two-player min-max game with value
+function $V(G, D)$,
+$$\min_G \max_D V(D, G) = \mathbb{E}_{\pmb{x} \sim p_{data}(\pmb{x})} \big[\log D(\pmb{x})\big] + \mathbb{E}_{\pmb{z} \sim p_{\pmb{z}}(\pmb{z})} \big[\log \big(1 - D(G(\pmb{z}))\big)\big]$$
+$p_{data}(\pmb{x})$ is the probability distribution over data,
+whilst $p_{\pmb{z}}(\pmb{z})$ is the probability distribution of $\pmb{z}$, which is set to
+Gaussian noise.
+This file defines the loss functions. Here is an MNIST example
+with two multilayer perceptrons for the generator and discriminator.
+import torch
+import torch.nn as nn
+import torch.utils.data
+import torch.utils.data
+
+from labml_helpers.module import Module
+The discriminator should ascend on the gradient,
+$$\nabla_{\theta_d} \frac{1}{m} \sum_{i=1}^{m} \Big[\log D\big(\pmb{x}^{(i)}\big) + \log \Big(1 - D\big(G(\pmb{z}^{(i)})\big)\Big)\Big]$$
+$m$ is the mini-batch size and $(i)$ is used to index samples in the mini-batch.
+$\pmb{x}$ are samples from $p_{data}$ and $\pmb{z}$ are samples from $p_z$.
+class DiscriminatorLogitsLoss(Module):
+    def __init__(self, smoothing: float = 0.2):
+        super().__init__()
+We use PyTorch Binary Cross Entropy Loss, which is
+$-\sum\Big[y \log(\hat{y}) + (1 - y) \log(1 - \hat{y})\Big]$,
+where $y$ are the labels and $\hat{y}$ are the predictions.
+Note the negative sign.
+We use labels equal to $1$ for $\pmb{x}$ from $p_{data}$
+and labels equal to $0$ for $\pmb{x}$ from $p_{G}$.
+Then descending on the sum of these is the same as ascending on
+the above gradient.
+BCEWithLogitsLoss combines a sigmoid and binary cross entropy loss.
        self.loss_true = nn.BCEWithLogitsLoss()
+        self.loss_false = nn.BCEWithLogitsLoss()
+We use label smoothing because it seems to work better in some cases
+        self.smoothing = smoothing
+Labels are registered as buffers and persistence is set to False.
        self.register_buffer('labels_true', _create_labels(256, 1.0 - smoothing, 1.0), False)
+        self.register_buffer('labels_false', _create_labels(256, 0.0, smoothing), False)
+logits_true are logits from $D(\pmb{x}^{(i)})$ and
+logits_false are logits from $D(G(\pmb{z}^{(i)}))$
    def __call__(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
        if len(logits_true) > len(self.labels_true):
+            self.register_buffer("labels_true",
+                                 _create_labels(len(logits_true), 1.0 - self.smoothing, 1.0, logits_true.device), False)
+        if len(logits_false) > len(self.labels_false):
+            self.register_buffer("labels_false",
+                                 _create_labels(len(logits_false), 0.0, self.smoothing, logits_false.device), False)
+
+        return (self.loss_true(logits_true, self.labels_true[:len(logits_true)]),
+                self.loss_false(logits_false, self.labels_false[:len(logits_false)]))
+class GeneratorLogitsLoss(Module):
+    def __init__(self, smoothing: float = 0.2):
+        super().__init__()
+        self.loss_true = nn.BCEWithLogitsLoss()
+        self.smoothing = smoothing
+We use labels equal to $1$ for $\pmb{x}$ from $p_{G}$.
+Then descending on this loss is the same as descending on
+the above gradient.
+        self.register_buffer('fake_labels', _create_labels(256, 1.0 - smoothing, 1.0), False)
+    def __call__(self, logits: torch.Tensor):
+        if len(logits) > len(self.fake_labels):
+            self.register_buffer("fake_labels",
+                                 _create_labels(len(logits), 1.0 - self.smoothing, 1.0, logits.device), False)
+
+        return self.loss_true(logits, self.fake_labels[:len(logits)])
+Create smoothed labels
+def _create_labels(n: int, r1: float, r2: float, device: torch.device = None):
+    return torch.empty(n, 1, requires_grad=False, device=device).uniform_(r1, r2)
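To make the intended use of `DiscriminatorLogitsLoss` and `GeneratorLogitsLoss` concrete, here is a hedged sketch of one training step; `discriminator`, `generator`, the optimizers, `latent_dim`, and the batch `x` are assumed placeholders, not objects defined in this file:

```python
import torch

# Assumed placeholders for illustration only.
latent_dim = 100
d_loss_fn = DiscriminatorLogitsLoss(smoothing=0.2)
g_loss_fn = GeneratorLogitsLoss(smoothing=0.2)

def train_step(discriminator, generator, d_optimizer, g_optimizer, x):
    # Discriminator step: real samples should score as true, generated ones as false.
    z = torch.randn(x.shape[0], latent_dim, device=x.device)
    loss_true, loss_false = d_loss_fn(discriminator(x), discriminator(generator(z).detach()))
    d_loss = loss_true + loss_false
    d_optimizer.zero_grad()
    d_loss.backward()
    d_optimizer.step()

    # Generator step: generated samples should be scored as true by the discriminator.
    z = torch.randn(x.shape[0], latent_dim, device=x.device)
    g_loss = g_loss_fn(discriminator(generator(z)))
    g_optimizer.zero_grad()
    g_loss.backward()
    g_optimizer.step()
    return d_loss.item(), g_loss.item()
```

In real use, the loss modules would be moved to the same device as the data so that the smoothed label buffers match the logits.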