From 0e59313b4b0345929c2a9776495912ddab4d5514 Mon Sep 17 00:00:00 2001
From: yunjey
Date: Thu, 13 Apr 2017 19:49:10 +0900
Subject: [PATCH] modified the code

---
 tutorials/09 - Image Captioning/download.sh     |  18 +--
 tutorials/09 - Image Captioning/model.py        |  16 +--
 .../09 - Image Captioning/requirements.txt      |   8 +-
 tutorials/09 - Image Captioning/sample.py       |  68 ++++++-----
 tutorials/09 - Image Captioning/train.py        | 112 +++++++++++-------
 5 files changed, 134 insertions(+), 88 deletions(-)

diff --git a/tutorials/09 - Image Captioning/download.sh b/tutorials/09 - Image Captioning/download.sh
index ad2d0da..751c87d 100755
--- a/tutorials/09 - Image Captioning/download.sh
+++ b/tutorials/09 - Image Captioning/download.sh
@@ -1,9 +1,11 @@
-wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P data/
-wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip -P data/
-wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip -P data/
+mkdir data
+wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
+wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip -P ./data/
+wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip -P ./data/
 
-unzip data/captions_train-val2014.zip -d data/
-unzip data/train2014.zip -d data/
-rm data/train2014.zip
-unzip data/val2014.zip -d data/
-rm data/val2014.zip
+unzip ./data/captions_train-val2014.zip -d ./data/
+rm ./data/captions_train-val2014.zip
+unzip ./data/train2014.zip -d ./data/
+rm ./data/train2014.zip
+unzip ./data/val2014.zip -d ./data/
+rm ./data/val2014.zip

diff --git a/tutorials/09 - Image Captioning/model.py b/tutorials/09 - Image Captioning/model.py
index 4b0b65c..e79fa02 100644
--- a/tutorials/09 - Image Captioning/model.py
+++ b/tutorials/09 - Image Captioning/model.py
@@ -7,7 +7,7 @@ from torch.autograd import Variable
 
 class EncoderCNN(nn.Module):
     def __init__(self, embed_size):
-        """Loads the pretrained ResNet-152 and replace top fc layer."""
+        """Load the pretrained ResNet-152 and replace the top fc layer."""
         super(EncoderCNN, self).__init__()
         self.resnet = models.resnet152(pretrained=True)
         for param in self.resnet.parameters():
@@ -17,12 +17,12 @@ class EncoderCNN(nn.Module):
         self.init_weights()
         
     def init_weights(self):
-        """Initialize weights."""
+        """Initialize the weights."""
         self.resnet.fc.weight.data.normal_(0.0, 0.02)
         self.resnet.fc.bias.data.fill_(0)
         
     def forward(self, images):
-        """Extracts the image feature vectors."""
+        """Extract the image feature vectors."""
         features = self.resnet(images)
         features = self.bn(features)
         return features
@@ -44,7 +44,7 @@ class DecoderRNN(nn.Module):
         self.linear.bias.data.fill_(0)
         
     def forward(self, features, captions, lengths):
-        """Decodes image feature vectors and generates captions."""
+        """Decode image feature vectors and generate captions."""
         embeddings = self.embed(captions)
         embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
         packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
@@ -56,11 +56,11 @@ class DecoderRNN(nn.Module):
         """Samples captions for given image features (Greedy search)."""
         sampled_ids = []
         inputs = features.unsqueeze(1)
-        for i in range(20):
-            hiddens, states = self.lstm(inputs, states)    # (batch_size, 1, hidden_size)
-            outputs = self.linear(hiddens.squeeze(1))      # (batch_size, vocab_size)
+        for i in range(20):                                # maximum sampling length
+            hiddens, states = self.lstm(inputs, states)    # (batch_size, 1, hidden_size)
+            outputs = self.linear(hiddens.squeeze(1))      # (batch_size, vocab_size)
             predicted = outputs.max(1)[1]
             sampled_ids.append(predicted)
             inputs = self.embed(predicted)
-        sampled_ids = torch.cat(sampled_ids, 1)            # (batch_size, 20)
+        sampled_ids = torch.cat(sampled_ids, 1)            # (batch_size, 20)
         return sampled_ids.squeeze()
\ No newline at end of file
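In the sample() loop above, decoding is greedy: at each of the 20 steps the highest-scoring word is chosen, embedded, and fed back as the next LSTM input. A minimal sketch of how the two modules compose at inference time, assuming the 0.1.x-era PyTorch API this patch targets (where outputs.max(1)[1] keeps the reduced dimension, so torch.cat(sampled_ids, 1) operates on (batch_size, 1) index tensors); the vocabulary size of 5000 is a placeholder, and instantiating EncoderCNN downloads the pretrained ResNet-152 weights:

    import torch
    from torch.autograd import Variable
    from model import EncoderCNN, DecoderRNN

    # Illustrative sizes; the tutorial defaults are embed_size=256,
    # hidden_size=512, num_layers=1. vocab_size=5000 is a placeholder.
    encoder = EncoderCNN(256)
    decoder = DecoderRNN(256, 512, 5000, 1)
    encoder.eval()  # BatchNorm must use running statistics at test time

    image = Variable(torch.randn(1, 3, 224, 224))  # one preprocessed image
    feature = encoder(image)                       # (1, embed_size)

    # Zero-initialized (h, c) states of shape (num_layers, batch, hidden_size),
    # exactly as sample.py constructs them below.
    state = (Variable(torch.zeros(1, 1, 512)),
             Variable(torch.zeros(1, 1, 512)))
    sampled_ids = decoder.sample(feature, state)   # (20,) greedy word ids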
diff --git a/tutorials/09 - Image Captioning/requirements.txt b/tutorials/09 - Image Captioning/requirements.txt
index 761c878..778d2a8 100644
--- a/tutorials/09 - Image Captioning/requirements.txt
+++ b/tutorials/09 - Image Captioning/requirements.txt
@@ -1,5 +1,5 @@
-matplotlib==2.0.0
-nltk==3.2.2
-numpy==1.12.0
-Pillow==4.0.0
+matplotlib
+nltk
+numpy
+Pillow
 argparse
\ No newline at end of file

diff --git a/tutorials/09 - Image Captioning/sample.py b/tutorials/09 - Image Captioning/sample.py
index fdc5693..be8ee71 100644
--- a/tutorials/09 - Image Captioning/sample.py
+++ b/tutorials/09 - Image Captioning/sample.py
@@ -1,48 +1,46 @@
-from vocab import Vocabulary
-from model import EncoderCNN, DecoderRNN
-from configuration import Config
-from PIL import Image
-from torch.autograd import Variable
 import torch
-import torchvision.transforms as T
 import matplotlib.pyplot as plt
 import numpy as np
 import argparse
 import pickle
 import os
+from torch.autograd import Variable
+from torchvision import transforms
+from build_vocab import Vocabulary
+from model import EncoderCNN, DecoderRNN
+from PIL import Image
 
-def main(params):
-    # Configuration for hyper-parameters
-    config = Config()
+def main(args):
+    # Image preprocessing
+    transform = transforms.Compose([
+        transforms.Scale(args.crop_size),
+        transforms.CenterCrop(args.crop_size),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
     
-    # Image Preprocessing
-    transform = config.test_transform
-    
-    # Load vocabulary
-    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
+    # Load vocabulary wrapper
+    with open(args.vocab_path, 'rb') as f:
         vocab = pickle.load(f)
     
     # Build Models
-    encoder = EncoderCNN(config.embed_size)
+    encoder = EncoderCNN(args.embed_size)
     encoder.eval()  # evaluation mode (BN uses moving mean/variance)
-    decoder = DecoderRNN(config.embed_size, config.hidden_size, 
-                         len(vocab), config.num_layers)
+    decoder = DecoderRNN(args.embed_size, args.hidden_size,
+                         len(vocab), args.num_layers)
    
    # Load the trained model parameters
-    encoder.load_state_dict(torch.load(os.path.join(config.model_path, 
-                                                    config.trained_encoder)))
-    decoder.load_state_dict(torch.load(os.path.join(config.model_path, 
-                                                    config.trained_decoder)))
+    encoder.load_state_dict(torch.load(args.encoder_path))
+    decoder.load_state_dict(torch.load(args.decoder_path))
 
     # Prepare Image
-    image = Image.open(params['image'])
+    image = Image.open(args.image)
     image_tensor = Variable(transform(image).unsqueeze(0))
     
     # Set initial states
-    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
-             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))
+    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
+             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))
     
     # If use gpu
     if torch.cuda.is_available():
@@ -71,7 +69,23 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--image', type=str, required=True, help='image for generating caption')
+    parser.add_argument('--image', type=str, required=True,
+                        help='input image for generating caption')
+    parser.add_argument('--encoder_path', type=str, default='./models/encoder-5-3000.pkl',
+                        help='path for trained encoder')
+    parser.add_argument('--decoder_path', type=str, default='./models/decoder-5-3000.pkl',
+                        help='path for trained decoder')
+    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
+                        help='path for vocabulary wrapper')
+    parser.add_argument('--crop_size', type=int, default=224,
+                        help='size for center cropping images')
+
+    # Model parameters (should be the same as the parameters in train.py)
+    parser.add_argument('--embed_size', type=int, default=256,
+                        help='dimension of word embedding vectors')
+    parser.add_argument('--hidden_size', type=int, default=512,
+                        help='dimension of lstm hidden states')
+    parser.add_argument('--num_layers', type=int, default=1,
+                        help='number of layers in lstm')
     args = parser.parse_args()
-    params = vars(args)
-    main(params)
\ No newline at end of file
+    main(args)
\ No newline at end of file
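At test time sample.py resizes the shorter image side, center-crops, and normalizes each channel to roughly [-1, 1], mirroring the training transform in train.py below (minus the random crop and flip). A standalone sketch of that pipeline; the image path is hypothetical, and transforms.Scale is the pre-0.2 torchvision name for what later releases call transforms.Resize:

    from PIL import Image
    from torch.autograd import Variable
    from torchvision import transforms

    transform = transforms.Compose([
        transforms.Scale(224),       # resize so the shorter side is 224
        transforms.CenterCrop(224),  # crop the central 224x224 patch
        transforms.ToTensor(),       # (3, 224, 224) float tensor in [0, 1]
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])  # ~[-1, 1]

    # Hypothetical path; convert('RGB') guards against grayscale inputs.
    image = Image.open('example.jpg').convert('RGB')
    image_tensor = Variable(transform(image).unsqueeze(0))  # (1, 3, 224, 224)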
diff --git a/tutorials/09 - Image Captioning/train.py b/tutorials/09 - Image Captioning/train.py
index d167fd1..05578f5 100644
--- a/tutorials/09 - Image Captioning/train.py
+++ b/tutorials/09 - Image Captioning/train.py
@@ -1,56 +1,56 @@
-from data import get_data_loader
-from vocab import Vocabulary
-from configuration import Config
+import argparse
+import torch
+import torch.nn as nn
+import numpy as np
+import os
+import pickle
+from data_loader import get_loader
+from build_vocab import Vocabulary
 from model import EncoderCNN, DecoderRNN
 from torch.autograd import Variable
 from torch.nn.utils.rnn import pack_padded_sequence
-import torch
-import torch.nn as nn
-import numpy as np
-import pickle
-import os
+from torchvision import transforms
 
-def main():
-    # Configuration for hyper-parameters
-    config = Config()
-    
+def main(args):
     # Create model directory
-    if not os.path.exists(config.model_path):
-        os.makedirs(config.model_path)
+    if not os.path.exists(args.model_path):
+        os.makedirs(args.model_path)
     
     # Image preprocessing
-    transform = config.train_transform
+    transform = transforms.Compose([
+        transforms.RandomCrop(args.crop_size),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
     
-    # Load vocabulary wrapper
-    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
+    # Load vocabulary wrapper.
+    with open(args.vocab_path, 'rb') as f:
         vocab = pickle.load(f)
-    
-    # Build data loader
-    image_path = os.path.join(config.image_path, 'train2014')
-    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
-    train_loader = get_data_loader(image_path, json_path, vocab, 
-                                   transform, config.batch_size,
-                                   shuffle=True, num_workers=config.num_threads)
-    total_step = len(train_loader)
-
-    # Build Models
-    encoder = EncoderCNN(config.embed_size)
-    decoder = DecoderRNN(config.embed_size, config.hidden_size, 
-                         len(vocab), config.num_layers)
-    if torch.cuda.is_available()
+    # Build data loader
+    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
+                             transform, args.batch_size,
+                             shuffle=True, num_workers=args.num_workers)
+
+    # Build the models
+    encoder = EncoderCNN(args.embed_size)
+    decoder = DecoderRNN(args.embed_size, args.hidden_size,
+                         len(vocab), args.num_layers)
+
+    if torch.cuda.is_available():
         encoder.cuda()
         decoder.cuda()
 
     # Loss and Optimizer
     criterion = nn.CrossEntropyLoss()
     params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
-    optimizer = torch.optim.Adam(params, lr=config.learning_rate)
+    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
     
     # Train the Models
-    for epoch in range(config.num_epochs):
-        for i, (images, captions, lengths) in enumerate(train_loader):
+    total_step = len(data_loader)
+    for epoch in range(args.num_epochs):
+        for i, (images, captions, lengths) in enumerate(data_loader):
             
             # Set mini-batch dataset
             images = Variable(images)
@@ -70,19 +70,50 @@ def main():
             optimizer.step()
             
             # Print log info
-            if i % config.log_step == 0:
+            if i % args.log_step == 0:
                 print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
-                      %(epoch, config.num_epochs, i, total_step, 
+                      %(epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))
                 
-            # Save the Model
-            if (i+1) % config.save_step == 0:
+            # Save the models
+            if (i+1) % args.save_step == 0:
                 torch.save(decoder.state_dict(), 
-                           os.path.join(config.model_path, 
+                           os.path.join(args.model_path,
                                         'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                 torch.save(encoder.state_dict(), 
-                           os.path.join(config.model_path, 
+                           os.path.join(args.model_path,
                                         'encoder-%d-%d.pkl' %(epoch+1, i+1)))
                 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, default='./models/',
+                        help='path for saving trained models')
+    parser.add_argument('--crop_size', type=int, default=224,
+                        help='size for randomly cropping images')
+    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
+                        help='path for vocabulary wrapper')
+    parser.add_argument('--image_dir', type=str, default='./data/resized2014',
+                        help='directory for resized images')
+    parser.add_argument('--caption_path', type=str,
+                        default='./data/annotations/captions_train2014.json',
+                        help='path for train annotation json file')
+    parser.add_argument('--log_step', type=int, default=10,
+                        help='step size for printing log info')
+    parser.add_argument('--save_step', type=int, default=1000,
+                        help='step size for saving trained models')
+
+    # Model parameters
+    parser.add_argument('--embed_size', type=int, default=256,
+                        help='dimension of word embedding vectors')
+    parser.add_argument('--hidden_size', type=int, default=512,
+                        help='dimension of lstm hidden states')
+    parser.add_argument('--num_layers', type=int, default=1,
+                        help='number of layers in lstm')
+
+    parser.add_argument('--num_epochs', type=int, default=5)
+    parser.add_argument('--batch_size', type=int, default=128)
+    parser.add_argument('--num_workers', type=int, default=2)
+    parser.add_argument('--learning_rate', type=float, default=0.001)
+    args = parser.parse_args()
+    print(args)
+    main(args)
\ No newline at end of file
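Throughout train.py, variable-length captions are handled with pack_padded_sequence: the decoder packs its embedded inputs (see model.py above), and the loss targets are packed with the same lengths so that cross-entropy never sees padding. A toy sketch of what packing does, assuming a batch already sorted by decreasing length as the tutorial's data loader provides; the word ids are made up:

    import torch
    from torch.nn.utils.rnn import pack_padded_sequence

    # Two padded captions of word ids, sorted by decreasing length:
    # caption 0 has 5 real tokens, caption 1 has 3 (0 is the pad id here).
    captions = torch.LongTensor([[4, 8, 15, 16, 23],
                                 [42, 7, 9, 0, 0]])
    lengths = [5, 3]

    packed = pack_padded_sequence(captions, lengths, batch_first=True)
    print(packed.data)         # the 8 real tokens, time-major: 4, 42, 8, 7, 15, 9, 16, 23
    print(packed.batch_sizes)  # [2, 2, 2, 1, 1]: active sequences at each time step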