modified the code

yunjey
2017-04-13 19:49:10 +09:00
parent fd445c96b9
commit 0e59313b4b
5 changed files with 134 additions and 88 deletions

View File

@@ -1,9 +1,11 @@
-wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P data/
-wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip -P data/
-wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip -P data/
+mkdir data
+wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
+wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip -P ./data/
+wget http://msvocds.blob.core.windows.net/coco2014/val2014.zip -P ./data/

-unzip data/captions_train-val2014.zip -d data/
-unzip data/train2014.zip -d data/
-rm data/train2014.zip
-unzip data/val2014.zip -d data/
-rm data/val2014.zip
+unzip ./data/captions_train-val2014.zip -d ./data/
+rm ./data/captions_train-val2014.zip
+unzip ./data/train2014.zip -d ./data/
+rm ./data/train2014.zip
+unzip ./data/val2014.zip -d ./data/
+rm ./data/val2014.zip
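
Taken as a whole, the plus side of this hunk is the updated dataset download script. Assuming it is saved as download.sh at the repository root (the file name is not shown in this view), a typical run is simply:

    bash download.sh

It creates ./data/, fetches the COCO 2014 train/val images and the caption annotations, unzips them, and removes each archive after extraction.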

View File

@@ -7,7 +7,7 @@ from torch.autograd import Variable
 
 class EncoderCNN(nn.Module):
     def __init__(self, embed_size):
-        """Loads the pretrained ResNet-152 and replace top fc layer."""
+        """Load the pretrained ResNet-152 and replace top fc layer."""
         super(EncoderCNN, self).__init__()
         self.resnet = models.resnet152(pretrained=True)
         for param in self.resnet.parameters():
@@ -17,12 +17,12 @@ class EncoderCNN(nn.Module):
         self.init_weights()
 
     def init_weights(self):
-        """Initialize weights."""
+        """Initialize the weights."""
         self.resnet.fc.weight.data.normal_(0.0, 0.02)
         self.resnet.fc.bias.data.fill_(0)
 
     def forward(self, images):
-        """Extracts the image feature vectors."""
+        """Extract the image feature vectors."""
         features = self.resnet(images)
         features = self.bn(features)
         return features
@@ -44,7 +44,7 @@ class DecoderRNN(nn.Module):
         self.linear.bias.data.fill_(0)
 
     def forward(self, features, captions, lengths):
-        """Decodes image feature vectors and generates captions."""
+        """Decode image feature vectors and generates captions."""
         embeddings = self.embed(captions)
         embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
         packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
@@ -56,11 +56,11 @@ class DecoderRNN(nn.Module):
         """Samples captions for given image features (Greedy search)."""
         sampled_ids = []
         inputs = features.unsqueeze(1)
-        for i in range(20):
+        for i in range(20):                                  # maximum sampling length
             hiddens, states = self.lstm(inputs, states)      # (batch_size, 1, hidden_size)
             outputs = self.linear(hiddens.squeeze(1))        # (batch_size, vocab_size)
             predicted = outputs.max(1)[1]
             sampled_ids.append(predicted)
             inputs = self.embed(predicted)
         sampled_ids = torch.cat(sampled_ids, 1)              # (batch_size, 20)
         return sampled_ids.squeeze()

View File

@@ -1,5 +1,5 @@
-matplotlib==2.0.0
-nltk==3.2.2
-numpy==1.12.0
-Pillow==4.0.0
+matplotlib
+nltk
+numpy
+Pillow
 argparse
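
This appears to be the project's requirements file with the exact version pins dropped, so any reasonably recent versions of these libraries will do. Assuming the usual file name requirements.txt, installation is:

    pip install -r requirements.txt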

View File

@@ -1,48 +1,46 @@
-from vocab import Vocabulary
-from model import EncoderCNN, DecoderRNN
-from configuration import Config
-from PIL import Image
-from torch.autograd import Variable
 import torch
-import torchvision.transforms as T
 import matplotlib.pyplot as plt
 import numpy as np
 import argparse
 import pickle
 import os
+from torch.autograd import Variable
+from torchvision import transforms
+from build_vocab import Vocabulary
+from model import EncoderCNN, DecoderRNN
+from PIL import Image
 
 
-def main(params):
-    # Configuration for hyper-parameters
-    config = Config()
-
-    # Image Preprocessing
-    transform = config.test_transform
-
-    # Load vocabulary
-    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
+def main(args):
+    # Image preprocessing
+    transform = transforms.Compose([
+        transforms.Scale(args.crop_size),
+        transforms.CenterCrop(args.crop_size),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    # Load vocabulary wrapper
+    with open(args.vocab_path, 'rb') as f:
         vocab = pickle.load(f)
 
     # Build Models
-    encoder = EncoderCNN(config.embed_size)
+    encoder = EncoderCNN(args.embed_size)
     encoder.eval()  # evaluation mode (BN uses moving mean/variance)
-    decoder = DecoderRNN(config.embed_size, config.hidden_size,
-                         len(vocab), config.num_layers)
+    decoder = DecoderRNN(args.embed_size, args.hidden_size,
+                         len(vocab), args.num_layers)
 
     # Load the trained model parameters
-    encoder.load_state_dict(torch.load(os.path.join(config.model_path,
-                                                    config.trained_encoder)))
-    decoder.load_state_dict(torch.load(os.path.join(config.model_path,
-                                                    config.trained_decoder)))
+    encoder.load_state_dict(torch.load(args.encoder_path))
+    decoder.load_state_dict(torch.load(args.decoder_path))
 
     # Prepare Image
-    image = Image.open(params['image'])
+    image = Image.open(args.image)
     image_tensor = Variable(transform(image).unsqueeze(0))
 
     # Set initial states
-    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
-             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))
+    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
+             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))
 
     # If use gpu
     if torch.cuda.is_available():
@@ -71,7 +69,23 @@ def main(params):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--image', type=str, required=True, help='image for generating caption')
+    parser.add_argument('--image', type=str, required=True,
+                        help='input image for generating caption')
+    parser.add_argument('--encoder_path', type=str, default='./models/encoder-5-3000.pkl',
+                        help='path for trained encoder')
+    parser.add_argument('--decoder_path', type=str, default='./models/decoder-5-3000.pkl',
+                        help='path for trained decoder')
+    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
+                        help='path for vocabulary wrapper')
+    parser.add_argument('--crop_size', type=int, default=224,
+                        help='size for center cropping images')
+
+    # Model parameters (should be same as parameters in train.py)
+    parser.add_argument('--embed_size', type=int, default=256,
+                        help='dimension of word embedding vectors')
+    parser.add_argument('--hidden_size', type=int, default=512,
+                        help='dimension of lstm hidden states')
+    parser.add_argument('--num_layers', type=int, default=1,
+                        help='number of layers in lstm')
     args = parser.parse_args()
-    params = vars(args)
-    main(params)
+    main(args)
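
With the Config object replaced by argparse, caption generation becomes a plain command-line call. A minimal sketch, assuming the script is saved as sample.py and that the default checkpoint and vocabulary paths from the argument definitions above exist (the image path here is only a placeholder):

    python sample.py --image ./data/example.png

The remaining flags (--encoder_path, --decoder_path, --vocab_path, --crop_size, --embed_size, --hidden_size, --num_layers) only need to be given when they differ from the defaults above, and the model parameters must match the values used during training.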

View File

@@ -1,56 +1,55 @@
-from data import get_data_loader
-from vocab import Vocabulary
-from configuration import Config
-from model import EncoderCNN, DecoderRNN
-from torch.autograd import Variable
-from torch.nn.utils.rnn import pack_padded_sequence
+import argparse
 import torch
 import torch.nn as nn
 import numpy as np
+import pickle
 import os
+from data_loader import get_loader
+from build_vocab import Vocabulary
+from model import EncoderCNN, DecoderRNN
+from torch.autograd import Variable
+from torch.nn.utils.rnn import pack_padded_sequence
+from torchvision import transforms
 
 
-def main():
-    # Configuration for hyper-parameters
-    config = Config()
-
+def main(args):
     # Create model directory
-    if not os.path.exists(config.model_path):
-        os.makedirs(config.model_path)
+    if not os.path.exists(args.model_path):
+        os.makedirs(args.model_path)
 
     # Image preprocessing
-    transform = config.train_transform
+    transform = transforms.Compose([
+        transforms.RandomCrop(args.crop_size),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
 
-    # Load vocabulary wrapper
-    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
+    # Load vocabulary wrapper.
+    with open(args.vocab_path, 'rb') as f:
         vocab = pickle.load(f)
 
     # Build data loader
-    image_path = os.path.join(config.image_path, 'train2014')
-    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
-    train_loader = get_data_loader(image_path, json_path, vocab,
-                                   transform, config.batch_size,
-                                   shuffle=True, num_workers=config.num_threads)
-    total_step = len(train_loader)
+    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
+                             transform, args.batch_size,
+                             shuffle=True, num_workers=args.num_workers)
 
-    # Build Models
-    encoder = EncoderCNN(config.embed_size)
-    decoder = DecoderRNN(config.embed_size, config.hidden_size,
-                         len(vocab), config.num_layers)
+    # Build the models
+    encoder = EncoderCNN(args.embed_size)
+    decoder = DecoderRNN(args.embed_size, args.hidden_size,
+                         len(vocab), args.num_layers)
 
     if torch.cuda.is_available():
         encoder.cuda()
         decoder.cuda()
 
     # Loss and Optimizer
     criterion = nn.CrossEntropyLoss()
     params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
-    optimizer = torch.optim.Adam(params, lr=config.learning_rate)
+    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
 
     # Train the Models
-    for epoch in range(config.num_epochs):
-        for i, (images, captions, lengths) in enumerate(train_loader):
+    total_step = len(data_loader)
+    for epoch in range(args.num_epochs):
+        for i, (images, captions, lengths) in enumerate(data_loader):
 
             # Set mini-batch dataset
             images = Variable(images)
@@ -70,19 +69,50 @@ def main():
             optimizer.step()
 
             # Print log info
-            if i % config.log_step == 0:
+            if i % args.log_step == 0:
                 print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
-                      %(epoch, config.num_epochs, i, total_step,
+                      %(epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))
 
-            # Save the Model
-            if (i+1) % config.save_step == 0:
+            # Save the models
+            if (i+1) % args.save_step == 0:
                 torch.save(decoder.state_dict(),
-                           os.path.join(config.model_path,
+                           os.path.join(args.model_path,
                                         'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                 torch.save(encoder.state_dict(),
-                           os.path.join(config.model_path,
+                           os.path.join(args.model_path,
                                         'encoder-%d-%d.pkl' %(epoch+1, i+1)))
 
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, default='./models/',
+                        help='path for saving trained models')
+    parser.add_argument('--crop_size', type=int, default=224,
+                        help='size for randomly cropping images')
+    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
+                        help='path for vocabulary wrapper')
+    parser.add_argument('--image_dir', type=str, default='./data/resized2014',
+                        help='directory for resized images')
+    parser.add_argument('--caption_path', type=str,
+                        default='./data/annotations/captions_train2014.json',
+                        help='path for train annotation json file')
+    parser.add_argument('--log_step', type=int, default=10,
+                        help='step size for printing log info')
+    parser.add_argument('--save_step', type=int, default=1000,
+                        help='step size for saving trained models')
+
+    # Model parameters
+    parser.add_argument('--embed_size', type=int, default=256,
+                        help='dimension of word embedding vectors')
+    parser.add_argument('--hidden_size', type=int, default=512,
+                        help='dimension of lstm hidden states')
+    parser.add_argument('--num_layers', type=int, default=1,
+                        help='number of layers in lstm')
+    parser.add_argument('--num_epochs', type=int, default=5)
+    parser.add_argument('--batch_size', type=int, default=128)
+    parser.add_argument('--num_workers', type=int, default=2)
+    parser.add_argument('--learning_rate', type=float, default=0.001)
+    args = parser.parse_args()
+    print(args)
+    main(args)
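
Training works the same way. A minimal sketch of a run, assuming the script is saved as train.py and that the resized images, caption annotations, and vocabulary referenced by the defaults above are already in place:

    python train.py --num_epochs 5 --batch_size 128

Every flag falls back to the default defined above, so running python train.py with no arguments also works. Checkpoints named encoder-<epoch>-<step>.pkl and decoder-<epoch>-<step>.pkl are written to --model_path every --save_step steps, which is where the encoder-5-3000.pkl and decoder-5-3000.pkl defaults expected by the sampling script come from.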