mirror of https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
synced 2025-08-26 16:50:39 +08:00

gpt notes
@@ -40,7 +40,9 @@ class CrossEntropyLoss(Module):

class NLPAutoRegressionConfigs(TrainValidConfigs):
    """
    <a id="NLPAutoRegressionConfigs">
    ## Trainer configurations
    </a>

    This has the basic configurations for NLP auto-regressive task training.
    All the properties are configurable.
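Since all of these trainer properties are configurable, an experiment can override them with a plain dictionary. A minimal sketch of that pattern, assuming a subclass along the lines of the GPT `Configs` further down in this diff (the `experiment.create` call before `experiment.configs` is assumed to mirror `main()` below):

```python
from labml import experiment
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs


class MyConfigs(NLPAutoRegressionConfigs):
    # Everything not overridden keeps the defaults defined in the class above
    pass


# Create an experiment, then override individual properties by key
experiment.create(name='nlp_autoregression_sketch')
conf = MyConfigs()
experiment.configs(conf, {'tokenizer': 'character', 'seq_len': 128, 'batch_size': 128})
```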
@@ -51,7 +51,7 @@ class AdaBelief(RAdam):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
                 weight_decay: WeightDecay = WeightDecay(), amsgrad=False,
                 degenerated_to_sgd=True,
                 degenerate_to_sgd=True,
                 rectify=True, defaults=None):
        """
        ### Initialize the optimizer
@@ -71,7 +71,7 @@ class AdaBelief(RAdam):
        """

        defaults = {} if defaults is None else defaults
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, degenerated_to_sgd, defaults)
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, degenerate_to_sgd, defaults)
        self.rectify = rectify

    def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
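The rename from `degenerated_to_sgd` to `degenerate_to_sgd` is also what direct callers now pass. A minimal construction sketch under that assumption; the module path and the `WeightDecay` helper taking the decay rate as its first argument are assumed from this repository's layout, not shown in the diff:

```python
from torch import nn

from labml_nn.optimizers import WeightDecay
from labml_nn.optimizers.ada_belief import AdaBelief

model = nn.Linear(16, 16)  # stand-in module for illustration
opt = AdaBelief(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
                weight_decay=WeightDecay(0.01), amsgrad=False,
                degenerate_to_sgd=True, rectify=True)
```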
@@ -17,7 +17,9 @@ from labml_nn.optimizers.amsgrad import AMSGrad


class AdamWarmupCosineDecay(AMSGrad):
    """
    ## Adam Optimizer with Warmup
    <a id="EmbeddingsWithPositionalEncoding">
    ## Adam Optimizer with Warmup and Cosine Decay
    </a>

    This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
    """
@@ -3,6 +3,8 @@
title: Configurable optimizer module
summary: This implements a configurable module for optimizers.
---

# Configurable Optimizer
"""

from typing import Tuple
@@ -14,28 +16,56 @@ from labml_nn.optimizers import WeightDecay


class OptimizerConfigs(BaseConfigs):
    """
    <a id="OptimizerConfigs">
    ## Optimizer Configurations
    </a>
    """

    # Optimizer
    optimizer: torch.optim.Adam

    # Weight decay
    weight_decay_obj: WeightDecay
    # Whether weight decay is decoupled;
    # i.e. weight decay is not added to gradients
    weight_decouple: bool = True
    # Weight decay
    weight_decay: float = 0.0
    # Whether weight decay is absolute or should be multiplied by learning rate
    weight_decay_absolute: bool = False

    # Whether the adam update is optimized (different epsilon)
    optimized_adam_update: bool = True

    # Parameters to be optimized
    parameters: any

    # Learning rate $\alpha$
    learning_rate: float = 0.01
    # Beta values $(\beta_1, \beta_2)$ for Adam
    betas: Tuple[float, float] = (0.9, 0.999)
    # Epsilon $\epsilon$ for adam
    eps: float = 1e-08

    # Momentum for SGD
    momentum: float = 0.5
    # Whether to use AMSGrad
    amsgrad: bool = False

    # Number of warmup optimizer steps
    warmup: int = 2_000
    # Total number of optimizer steps (for cosine decay)
    total_steps: int = int(1e10)
    degenerated_to_sgd: bool = True

    # Whether to degenerate to SGD in AdaBelief
    degenerate_to_sgd: bool = True

    # Whether to use Rectified Adam in AdaBelief
    rectify: bool = True

    # Model embedding size for Noam optimizer
    d_model: int
    step_factor: int = 1024

    def __init__(self):
        super().__init__(_primary='optimizer')
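Because `optimizer` is the primary config, an `OptimizerConfigs` instance only needs a parameter list and an optimizer name; the remaining properties above act as overridable defaults. A minimal sketch of the pattern used later in this diff (the `labml` config machinery is assumed to resolve the named option lazily):

```python
from torch import nn

from labml_nn.optimizers.configs import OptimizerConfigs

model = nn.Linear(16, 16)            # stand-in module for illustration

opt_conf = OptimizerConfigs()
opt_conf.parameters = model.parameters()
opt_conf.optimizer = 'AdaBelief'     # any name registered with @option below
opt_conf.learning_rate = 3e-4
opt_conf.degenerate_to_sgd = True    # field renamed in this commit
opt_conf.rectify = True
```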
@@ -84,7 +114,7 @@ def _radam_optimizer(c: OptimizerConfigs):
    return RAdam(c.parameters,
                 lr=c.learning_rate, betas=c.betas, eps=c.eps,
                 weight_decay=c.weight_decay_obj, amsgrad=c.amsgrad,
                 degenerated_to_sgd=c.degenerated_to_sgd)
                 degenerated_to_sgd=c.degenerate_to_sgd)


@option(OptimizerConfigs.optimizer, 'AdaBelief')
@@ -93,7 +123,7 @@ def _ada_belief_optimizer(c: OptimizerConfigs):
    return AdaBelief(c.parameters,
                     lr=c.learning_rate, betas=c.betas, eps=c.eps,
                     weight_decay=c.weight_decay_obj, amsgrad=c.amsgrad,
                     degenerated_to_sgd=c.degenerated_to_sgd,
                     degenerate_to_sgd=c.degenerate_to_sgd,
                     rectify=c.rectify)
@@ -18,60 +18,98 @@ from .models import EmbeddingsWithPositionalEncoding, EmbeddingsWithLearnedPosit


class TransformerConfigs(BaseConfigs):
    """
    <a id="TransformerConfigs">
    ## Transformer Configurations
    </a>

    This defines configurations for a transformer.
    The configurations are calculated using option functions.
    These are lazy loaded and therefore only the necessary modules
    are calculated.
    """
    # Number of attention heads
    n_heads: int = 8
    # Transformer embedding size
    d_model: int = 512
    # Number of layers
    n_layers: int = 6
    # Number of features in position-wise feedforward layer
    d_ff: int = 2048
    # Dropout probability
    dropout: float = 0.1
    # Number of tokens in the source vocabulary (for token embeddings)
    n_src_vocab: int
    # Number of tokens in the target vocabulary (to generate logits for prediction)
    n_tgt_vocab: int

    # The encoder self attention
    encoder_attn: MultiHeadAttention = 'mha'
    # The decoder self attention
    decoder_attn: MultiHeadAttention = 'mha'
    # The decoder memory attention
    decoder_mem_attn: MultiHeadAttention = 'mha'
    # Position-wise feedforward layer
    feed_forward: FeedForward
    # Activation in position-wise feedforward layer
    feed_forward_activation: nn.Module = 'ReLU'

    encoder_layer: TransformerLayer = 'normal'
    decoder_layer: TransformerLayer = 'normal'
    # Encoder layer
    encoder_layer: TransformerLayer = 'default'
    # Decoder layer
    decoder_layer: TransformerLayer = 'default'

    encoder: Encoder = 'normal'
    decoder: Decoder = 'normal'
    # Encoder consisting of multiple encoder layers
    encoder: Encoder = 'default'
    # Decoder consisting of multiple decoder layers
    decoder: Decoder = 'default'

    # Embedding layer for source
    src_embed: Module = 'fixed_pos'
    # Embedding layer for target (for decoder)
    tgt_embed: Module = 'fixed_pos'

    # Logit generator for prediction
    generator: Generator = 'default'

    # Encoder-decoder
    encoder_decoder: EncoderDecoder
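All of the string-valued fields above are names of registered options, so swapping a component is a one-line override. A sketch of selecting relative multi-head attention and GELU, assuming the options registered further down in this file (the vocabulary size of 65 is just an illustrative character-level value):

```python
from labml_nn.transformers import TransformerConfigs

conf = TransformerConfigs()
# Vocabulary sizes have no defaults and must be set before the model is built
conf.n_src_vocab = 65
conf.n_tgt_vocab = 65
# Swap components by option name
conf.encoder_attn = 'relative'          # registered via `calculate(...)` below
conf.feed_forward_activation = 'GELU'   # same override the GPT experiment uses
```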
@option(TransformerConfigs.feed_forward_activation, 'ReLU')
def _feed_forward_activation_relu():
    """
    ReLU activation
    """
    return nn.ReLU()


@option(TransformerConfigs.feed_forward_activation, 'GELU')
def _feed_forward_activation_gelu():
    """
    GELU activation
    """
    return nn.GELU()


@option(TransformerConfigs.feed_forward, 'default')
def _feed_forward(c: TransformerConfigs):
    """
    Create feedforward layer
    """
    return FeedForward(c.d_model, c.d_ff, c.dropout, c.feed_forward_activation)
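New options plug into the same registry. For example, another activation could be added with the same `@option` decorator; the `'LeakyReLU'` name here is purely illustrative and not part of the repository:

```python
from torch import nn

from labml.configs import option
from labml_nn.transformers import TransformerConfigs


@option(TransformerConfigs.feed_forward_activation, 'LeakyReLU')
def _feed_forward_activation_leaky_relu():
    """
    LeakyReLU activation (illustrative only)
    """
    return nn.LeakyReLU()
```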
# ## MHA
# ### Multi-head Attention
def _mha(c: TransformerConfigs):
    return MultiHeadAttention(c.n_heads, c.d_model)


calculate(TransformerConfigs.encoder_attn, 'mha', _mha)
calculate(TransformerConfigs.decoder_attn, 'mha', _mha)
calculate(TransformerConfigs.decoder_mem_attn, 'mha', _mha)


# ## Relative MHA
# ### Relative Multi-head Attention
def _relative_mha(c: TransformerConfigs):
    from .relative_mha import RelativeMultiHeadAttention
    return RelativeMultiHeadAttention(c.n_heads, c.d_model)
@@ -82,60 +120,90 @@ calculate(TransformerConfigs.decoder_attn, 'relative', _relative_mha)
calculate(TransformerConfigs.decoder_mem_attn, 'relative', _relative_mha)


@option(TransformerConfigs.encoder_layer, 'normal')
@option(TransformerConfigs.encoder_layer, 'default')
def _encoder_layer(c: TransformerConfigs):
    """
    Encoder layer
    """
    return TransformerLayer(d_model=c.d_model, self_attn=c.encoder_attn,
                            src_attn=None, feed_forward=copy.deepcopy(c.feed_forward),
                            dropout_prob=c.dropout)


@option(TransformerConfigs.decoder_layer, 'normal')
@option(TransformerConfigs.decoder_layer, 'default')
def _decoder_layer(c: TransformerConfigs):
    """
    Decoder layer
    """
    return TransformerLayer(d_model=c.d_model, self_attn=c.decoder_attn,
                            src_attn=c.decoder_mem_attn, feed_forward=copy.deepcopy(c.feed_forward),
                            dropout_prob=c.dropout)


@option(TransformerConfigs.encoder, 'normal')
@option(TransformerConfigs.encoder, 'default')
def _encoder(c: TransformerConfigs):
    """
    Encoder
    """
    return Encoder(c.encoder_layer, c.n_layers)


@option(TransformerConfigs.decoder, 'normal')
@option(TransformerConfigs.decoder, 'default')
def _decoder(c: TransformerConfigs):
    """
    Decoder
    """
    return Decoder(c.decoder_layer, c.n_layers)


@option(TransformerConfigs.generator, 'default')
def _generator(c: TransformerConfigs):
    """
    Logit generator
    """
    return Generator(c.n_tgt_vocab, c.d_model)


# ## Positional Embeddings
@option(TransformerConfigs.src_embed, 'fixed_pos')
def _src_embed_with_positional(c: TransformerConfigs):
    """
    Source embedding with fixed positional encodings
    """
    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_src_vocab)


@option(TransformerConfigs.tgt_embed, 'fixed_pos')
def _tgt_embed_with_positional(c: TransformerConfigs):
    """
    Target embedding with fixed positional encodings
    """
    return EmbeddingsWithPositionalEncoding(c.d_model, c.n_tgt_vocab)


# ## Learned Positional Embeddings
@option(TransformerConfigs.src_embed, 'learned_pos')
def _src_embed_with_learned_positional(c: TransformerConfigs):
    """
    Source embedding with learned positional encodings
    """
    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_src_vocab)


@option(TransformerConfigs.tgt_embed, 'learned_pos')
def _tgt_embed_with_learned_positional(c: TransformerConfigs):
    """
    Target embedding with learned positional encodings
    """
    return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_tgt_vocab)


# ## No Positional Embeddings
@option(TransformerConfigs.src_embed, 'no_pos')
def _src_embed_without_positional(c: TransformerConfigs):
    """
    Source embedding without positional encodings
    """
    return nn.Embedding(c.n_src_vocab, c.d_model)


@@ -144,6 +212,6 @@ def _tgt_embed_without_positional(c: TransformerConfigs):
    return nn.Embedding(c.n_tgt_vocab, c.d_model)


@option(TransformerConfigs.encoder_decoder, 'normal')
@option(TransformerConfigs.encoder_decoder, 'default')
def _encoder_decoder(c: TransformerConfigs):
    return EncoderDecoder(c.encoder, c.decoder, c.src_embed, c.tgt_embed, c.generator)
@@ -27,11 +27,11 @@ For the transformer we reuse the
"""

import torch
from torch import nn

from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from torch import nn

from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.optimizers.configs import OptimizerConfigs
from labml_nn.transformers import TransformerConfigs, Encoder
@@ -45,6 +45,7 @@ class GPT(Module):
    This consists of a token embedding layer, transformer encoder, and
    a final linear layer that gives token logits.
    """

    def __init__(self, encoder: Encoder, src_embed: Module, generator: Module):
        """
        * `encoder` is the transformer [Encoder](../models.html#Encoder)
@@ -82,42 +83,71 @@ class Configs(NLPAutoRegressionConfigs):
    """
    ## Configurations

    This inherits
    This inherits from
    [`NLPAutoRegressionConfigs`](../../experiments/nlp_autoregression.html#NLPAutoRegressionConfigs)
    """

    # GPT model
    model: GPT
    # Transformer
    transformer: TransformerConfigs
    # Weight decay
    weight_decay: float = 0.1
    # Number of tokens for warmup
    warmup_steps: int = 128 * 128 * 20

    # Custom optimizer
    optimizer = 'transformer_optimizer'


@option(Configs.transformer, 'GPT')
def _transformer_configs(c: Configs):
    """
    ### Transformer configurations
    """

    # We use our
    # [configurable transformer implementation](../configs.html#TransformerConfigs)
    conf = TransformerConfigs()
    # Set the vocabulary sizes for embeddings and generating logits
    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens
    # GPT uses GELU activation for position-wise feedforward
    conf.feed_forward_activation = 'GELU'

    #
    return conf
def _init_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
    """
    ### Initialize weights

    Weights of linear layers and embedding layers are initialized
    to $\mathcal{N}(0, 0.02)$
    instead of the default Xavier initialization.
    """

    if not isinstance(module, (nn.Linear, nn.Embedding)):
        return

    module.weight.data.normal_(mean=0.0, std=0.02)

    # Initialize biases to $0$
    if isinstance(module, nn.Linear) and module.bias is not None:
        module.bias.data.zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
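A quick sanity-check sketch for the initializer, assuming `_init_weights` above is in scope (illustrative only, not part of the commit):

```python
from torch import nn

layer = nn.Linear(512, 512)
_init_weights(layer)
# Weight standard deviation should be close to 0.02 and biases exactly zero
print(float(layer.weight.std()), float(layer.bias.abs().max()))
```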
@option(Configs.model)
def _model(c: Configs):
    """
    Create GPT model and initialize weights
    """
    m = GPT(c.transformer.encoder,
            c.transformer.src_embed,
            c.transformer.generator).to(c.device)

    # Apply custom weight initialization
    m.apply(_init_weights)

    return m
@@ -125,39 +155,25 @@ def _model(c: Configs):

@option(NLPAutoRegressionConfigs.optimizer)
def transformer_optimizer(c: NLPAutoRegressionConfigs):
    optimizer = OptimizerConfigs()
    """
    ### Create custom optimizer with weight decay

    This code is taken from [minGPT](https://github.com/karpathy/minGPT).
    This applies weight decay only to weights of linear layers.
    """
    # Collect names of parameters to apply weight decay
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (nn.Linear,)
    blacklist_weight_modules = (nn.LayerNorm, nn.Embedding)
    for mn, m in c.model.named_modules():
        for pn, p in m.named_parameters():
            fpn = f'{mn}.{pn}' if mn else pn  # full param name

            if fpn.find('positional_encodings') != -1:
                no_decay.add(fpn)
            elif fpn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif fpn.endswith('weight'):
                if isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
            if fpn.endswith('weight') and isinstance(m, nn.Linear):
                decay.add(fpn)
                elif isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

    # validate that we considered every parameter
    # Get all the parameters
    param_dict = {pn: p for pn, p in c.model.named_parameters()}

    inter_params = decay & no_decay
    if inter_params:
        raise ValueError("Repeated parameters", inter_params)

    missing_params = set(param_dict.keys()) - (decay | no_decay)
    if missing_params:
        raise ValueError('Missing parameters', missing_params)
    # Parameters that are not decayed
    no_decay = set(param_dict.keys()) - decay

    # create the pytorch optimizer object
    opt_groups = [
@@ -165,15 +181,33 @@ def transformer_optimizer(c: NLPAutoRegressionConfigs):
        {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
    ]

    # Create a [configurable optimizer](../optimizers/configs.html#OptimizerConfigs),
    # so that we can change these simply by passing
    # a config dictionary.
    optimizer = OptimizerConfigs()

    # Set parameter groups for optimization
    optimizer.parameters = opt_groups
    # Use [cosine decay optimizer](../optimizers/adam_warmup_cosine_decay.html)
    # This is what GPT uses
    optimizer.optimizer = 'AdamWarmupCosineDecay'
    # Set model embedding size, required if we use [Noam optimizer](../optimizers/noam.html)
    # which has an exponential decay
    optimizer.d_model = c.d_model
    # Set default weight decay.
    # This is not required since we set the weight decay in the parameter groups
    optimizer.weight_decay = c.weight_decay
    # GPT uses a maximum learning rate of $6 \times 10^{-4}$
    optimizer.learning_rate = 6e-4
    # $\beta_1 = 0.9, \beta_2 = 0.95$
    optimizer.betas = (0.9, 0.95)
    # $\epsilon = 10^{-8}$
    optimizer.eps = 1e-8
    # Weight decay decoupled from gradients
    optimizer.weight_decouple = True
    optimizer.total_steps = c.epochs * len(c.text.train)
    # Total number of optimization steps for learning rate cosine decay
    optimizer.total_steps = c.epochs * len(c.text.train) // (c.batch_size * c.seq_len)
    # Number of warmup optimization steps
    optimizer.warmup = c.warmup_steps // (c.batch_size * c.seq_len)

    return optimizer
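To make the token-to-step conversion above concrete, here is the arithmetic with the settings from `main()` below (batch_size = 128, seq_len = 128, epochs = 32); treating Tiny Shakespeare as roughly 1.1M characters is an assumption for illustration:

```python
batch_size, seq_len, epochs = 128, 128, 32
tokens_per_step = batch_size * seq_len          # 16,384 tokens per optimizer step

warmup_tokens = 128 * 128 * 20                  # `warmup_steps` in `Configs`
print(warmup_tokens // tokens_per_step)         # 20 warmup optimizer steps

train_chars = 1_100_000                         # assumed dataset size
print(epochs * train_chars // tokens_per_step)  # about 2,148 total steps for the cosine decay
```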
@@ -185,33 +219,39 @@ def main():
    # Create configs
    conf = Configs()
    # Load configurations
    experiment.configs(conf,
                       # A dictionary of configurations to override
                       {'tokenizer': 'character',
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of $128$
        'seq_len': 128,
        # Train for $32$ epochs
        'epochs': 32,
        # Batch size $128$
        'batch_size': 128,
        # Switch between training and validation for $10$ times
        # per epoch
        'inner_iterations': 10,

        # Transformer configurations
        'transformer.d_model': 512,
        'transformer.d_ff': 2048,
        'transformer.n_heads': 8,
        'transformer.n_layers': 6})

    # This is needed to initialize models
    conf.n_tokens = conf.text.n_tokens
        'transformer.n_layers': 6
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # `TrainValidConfigs.run`
        # Run training
        conf.run()