modify the model

This commit is contained in:
yunjey
2017-03-13 14:35:34 +09:00
parent eadb0f9580
commit a500ce7396
3 changed files with 32 additions and 12 deletions

View File

@ -10,9 +10,15 @@ class EncoderCNN(nn.Module):
"""Load pretrained ResNet-152 and replace top fc layer.""" """Load pretrained ResNet-152 and replace top fc layer."""
super(EncoderCNN, self).__init__() super(EncoderCNN, self).__init__()
self.resnet = models.resnet152(pretrained=True) self.resnet = models.resnet152(pretrained=True)
self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size) # For efficient memory usage.
for param in self.resnet.parameters(): for param in self.resnet.parameters():
param.requires_grad = False param.requires_grad = False
self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)
self.init_weights()
def init_weights(self):
    """Initialize the replacement fully connected layer of the ResNet.

    Weights are drawn uniformly from [-0.1, 0.1] and the bias is zeroed.
    """
    fc_layer = self.resnet.fc
    fc_layer.bias.data.fill_(0)
    fc_layer.weight.data.uniform_(-0.1, 0.1)
def forward(self, images): def forward(self, images):
"""Extract image feature vectors.""" """Extract image feature vectors."""
@ -30,6 +36,11 @@ class DecoderRNN(nn.Module):
self.embed = nn.Embedding(vocab_size, embed_size) self.embed = nn.Embedding(vocab_size, embed_size)
self.lstm = nn.LSTM(embed_size, hidden_size, num_layers) self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
self.linear = nn.Linear(hidden_size, vocab_size) self.linear = nn.Linear(hidden_size, vocab_size)
def init_weights(self):
    """Initialize the embedding and output projection parameters.

    The embedding table and the linear layer's weights are drawn
    uniformly from [-0.1, 0.1]; the linear layer's bias is set to zero.
    """
    self.embed.weight.data.uniform_(-0.1, 0.1)
    # Fixed typo: the attribute is `weight`; `weigth` raised AttributeError.
    self.linear.weight.data.uniform_(-0.1, 0.1)
    self.linear.bias.data.fill_(0)
def forward(self, features, captions, lengths): def forward(self, features, captions, lengths):
"""Decode image feature vectors and generate caption.""" """Decode image feature vectors and generate caption."""

View File

@ -1,6 +1,7 @@
import os import os
import numpy as np import numpy as np
import torch import torch
import torchvision.transforms as T
import pickle import pickle
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from PIL import Image from PIL import Image
@ -8,6 +9,12 @@ from model import EncoderCNN, DecoderRNN
from vocab import Vocabulary from vocab import Vocabulary
from torch.autograd import Variable from torch.autograd import Variable
# Image processing: crop the central 224x224 region, convert the image to a
# tensor, then normalize each of the three channels with mean 0.5 and std 0.5.
transform = T.Compose(
    [
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)
# Hyper Parameters # Hyper Parameters
embed_size = 128 embed_size = 128
hidden_size = 512 hidden_size = 512
@ -18,11 +25,10 @@ with open('./data/vocab.pkl', 'rb') as f:
vocab = pickle.load(f) vocab = pickle.load(f)
# Load an image array # Load an image array
images = os.listdir('./data/val2014resized/') images = os.listdir('./data/train2014resized/')
image_path = './data/val2014resized/' + images[12] image_path = './data/train2014resized/' + images[12]
with open(image_path, 'r+b') as f: img = Image.open(image_path)
img = np.asarray(Image.open(f)) image = transform(img).unsqueeze(0)
image = torch.from_numpy(img.transpose(2, 0, 1)).float().unsqueeze(0) / 255 - 0.5
# Load the trained models # Load the trained models
encoder = torch.load('./encoder.pkl') encoder = torch.load('./encoder.pkl')

View File

@ -1,6 +1,6 @@
from data import get_loader from data import get_loader
from vocab import Vocabulary from vocab import Vocabulary
from models import EncoderCNN, DecoderRNN from model import EncoderCNN, DecoderRNN
from torch.autograd import Variable from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence from torch.nn.utils.rnn import pack_padded_sequence
import torch import torch
@ -10,10 +10,11 @@ import torchvision.transforms as T
import pickle import pickle
# Hyper Parameters # Hyper Parameters
num_epochs = 5 num_epochs = 1
batch_size = 100 batch_size = 32
embed_size = 128 embed_size = 256
hidden_size = 512 hidden_size = 512
crop_size = 224
num_layers = 1 num_layers = 1
learning_rate = 0.001 learning_rate = 0.001
train_image_path = './data/train2014resized/' train_image_path = './data/train2014resized/'
@ -21,6 +22,7 @@ train_json_path = './data/annotations/captions_train2014.json'
# Image Preprocessing # Image Preprocessing
transform = T.Compose([ transform = T.Compose([
T.RandomCrop(crop_size),
T.RandomHorizontalFlip(), T.RandomHorizontalFlip(),
T.ToTensor(), T.ToTensor(),
T.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]) T.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
@ -42,7 +44,8 @@ decoder.cuda()
# Loss and Optimizer # Loss and Optimizer
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate) params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
# Train the Decoder # Train the Decoder
for epoch in range(num_epochs): for epoch in range(num_epochs):
@ -63,7 +66,7 @@ for epoch in range(num_epochs):
if i % 100 == 0: if i % 100 == 0:
print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
%(epoch, num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) %(epoch, num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0])))
# Save the Model # Save the Model
torch.save(decoder, 'decoder.pkl') torch.save(decoder, 'decoder.pkl')
torch.save(encoder, 'encoder.pkl') torch.save(encoder, 'encoder.pkl')