
Commit

add variable length for pytorch;
add pretrained model.
dongfang91 committed Nov 7, 2018
1 parent 4fe321f commit 1cba376
Showing 7 changed files with 446 additions and 29 deletions.
13 changes: 10 additions & 3 deletions gru.py
@@ -22,11 +22,18 @@ def last_timestep(self, unpacked, lengths):
        # Index of the last output for each sequence.
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
+       if torch.cuda.is_available():
+           idx = idx.cuda()

        return unpacked.gather(1, idx).squeeze()

    def init_hidden(self):
-       h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
-       c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
+       if torch.cuda.is_available():
+           h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda()
+           c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda()
+       else:
+           h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
+           c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)

    def forward(self, sentence,lengths):
@@ -41,4 +48,4 @@ def forward(self, sentence,lengths):
        hidden_1 = self.hidden2hidden1(last_outputs)
        hidden_1 = self.relu1(hidden_1)
        y = self.hidden2label(hidden_1)
-       return y
+       return y
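A note on the gather trick above: it selects each sequence's output at its true final timestep (length - 1) rather than at the padded maximum length. A minimal standalone sketch, with shapes assumed from the surrounding code (unpacked is [batch, max_len, hidden]):

import torch

# Standalone sketch of the last-timestep selection (assumed shapes:
# unpacked is [batch, max_len, hidden], lengths holds the true lengths).
batch, max_len, hidden = 3, 5, 4
unpacked = torch.arange(batch * max_len * hidden, dtype=torch.float).view(batch, max_len, hidden)
lengths = torch.tensor([5, 3, 1])

# Expand (lengths - 1) to [batch, 1, hidden] so gather can pick the
# hidden vector at each sequence's final valid position.
idx = (lengths - 1).view(-1, 1).expand(batch, hidden).unsqueeze(1)
last = unpacked.gather(1, idx).squeeze(1)
print(last.shape)  # torch.Size([3, 4])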
50 changes: 50 additions & 0 deletions gru_pretrained.py
@@ -0,0 +1,50 @@
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable


class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2hidden1 = nn.Linear(hidden_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.dropout = nn.Dropout(0.5)


    def last_timestep(self, unpacked, lengths):
        # Index of the last output for each sequence.
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
        if torch.cuda.is_available():
            idx = idx.cuda()

        return unpacked.gather(1, idx).squeeze()

    def init_hidden(self):
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda()
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda()
        else:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)

    def forward(self, sentence,lengths):

        packed = torch.nn.utils.rnn.pack_padded_sequence(sentence, lengths,batch_first=True)
        lstm_out, self.hidden = self.lstm(packed, self.hidden)
        unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(lstm_out,batch_first=True)
        # get the outputs from the last *non-masked* timestep for each sentence
        last_outputs = self.last_timestep(unpacked, unpacked_len)
        last_outputs = self.dropout(last_outputs)
        hidden_1 = self.hidden2hidden1(last_outputs)
        hidden_1 = self.relu1(hidden_1)
        y = self.hidden2label(hidden_1)
        return y
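Note that this forward() packs `sentence` directly and never calls self.word_embeddings, so the inputs are presumably already dense (pretrained) vectors. The diff does not show where the pretrained weights come from; a common pattern, sketched here purely as an assumption (the `pretrained` matrix is a placeholder, not something this commit ships), is nn.Embedding.from_pretrained:

import torch
import torch.nn as nn

# Hypothetical loading of pretrained word vectors into an embedding layer.
# `pretrained` stands in for a [vocab_size, embedding_dim] matrix read from
# disk (e.g. GloVe); it is an assumption, not part of this commit.
pretrained = torch.randn(100, 128)
embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

token_ids = torch.tensor([[1, 5, 7]])
print(embedding(token_ids).shape)  # torch.Size([1, 3, 128])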
73 changes: 73 additions & 0 deletions read_files.py
@@ -0,0 +1,73 @@

# encoding: utf-8
import json
import numpy as np
import sys
import shutil
if sys.version_info[0]==2:
    import cPickle as pickle
else:
    import pickle
import os

def create_folder(filename):
    if "\\" in filename:
        a = '\\'.join(filename.split('\\')[:-1])
    else:
        a = '/'.join(filename.split('/')[:-1])
    if not os.path.exists(a):
        os.makedirs(a)



def savein_json(filename, array):
    create_folder(filename)
    with open(filename+'.txt', 'w') as outfile:
        json.dump(array, outfile)
        print("Save into files: ",filename)
    outfile.close()

def readfrom_json(filename):
    with open(filename+'.txt', 'r') as outfile:
        data = json.load(outfile)
    outfile.close()
    return data

def savein_pickle(file,array):
    create_folder(file)
    with open(file, 'wb') as handle:
        pickle.dump(array, handle)

def readfrom_pickle(file):
    with open(file, 'rb') as handle:
        if sys.version_info[0] == 2:
            data = pickle.load(handle)
        else:
            data = pickle.load(handle,encoding='latin1')
    return data

def readfrom_txt(path):
    data =open(path).read()
    return data

def textfile2list(path):
    data = readfrom_txt(path)
    txt_list =list()
    for line in data.splitlines():
        txt_list.append(line)
    return txt_list


def movefiles(dir_simples,old_address,new_address,abbr=""):
    for dir_simple in dir_simples:
        desti = dir_simple.replace(old_address,new_address)
        desti = desti.replace("TimeNorm.gold.completed.xml","TimeNorm.system.completed.xml")
        create_folder(desti)
        shutil.copy(dir_simple+abbr,desti)

def movefiles_folders(dir_simples,old_address,new_address,abbr=""):
    for dir_simple in dir_simples:
        if not os.path.exists(new_address+"/"+dir_simple):
            os.makedirs(new_address+"/"+dir_simple)
        shutil.copy(old_address+"/"+dir_simple+"/"+dir_simple+".TimeNorm.gold.completed.xml",new_address+"/"+dir_simple+"/"+dir_simple+".TimeNorm.gold.completed.xml")
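A quick usage sketch for the JSON helpers above; the path is hypothetical, and note that savein_json/readfrom_json append ".txt" to the filename themselves:

import read_files as read

# Round-trip a small object; "data/example" is a hypothetical path and
# the helpers actually read/write "data/example.txt" under the hood.
read.savein_json("data/example", {"labels": [0, 1, 1]})
data = read.readfrom_json("data/example")
print(data["labels"])  # [0, 1, 1]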

22 changes: 16 additions & 6 deletions rnn_baseline.py
@@ -6,6 +6,7 @@
from torch.autograd import Variable
import torch_preprocess as function
import math
+import read_files as read


np.random.seed(123)
@@ -160,14 +161,21 @@ def rnn_baseline(dataset,train_model):

print("Run RNN Baseline for %s: ..." %(dataset))

texts, label_texts, labels = function.load_data(
"data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".train.txt",
"data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".validation.txt",
"data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".test.txt")

vocab_dict, label_dict = function.get_vocab(texts, label_texts, labels)
vocab_dict["UNK"] = len(vocab_dict) + 1
vocab_dict["PAD"] = 0


for i in range(10):
texts, label_texts, labels = function.load_data("data/"+dataset + "/"+dataset+".fold-"+ str(i) +".train.txt",
"data/"+dataset + "/"+dataset+".fold-"+ str(i) +".validation.txt",
"data/"+dataset + "/"+dataset+".fold-"+ str(i) +".test.txt")

vocab_dict, label_dict =function.get_vocab(texts,label_texts,labels)
vocab_dict["UNK"] = len(vocab_dict)+1
vocab_dict["PAD"] = 0
max_length = 56
train_x,train_y,train_sentence_len = function.dataset_preprocess(texts[0],labels[0], vocab_dict,label_dict,max_length)
valid_x,valid_y , valid_sentence_len = function.dataset_preprocess(texts[1],labels[1], vocab_dict,label_dict,max_length)
Expand All @@ -180,16 +188,18 @@ def rnn_baseline(dataset,train_model):
        else:
            model = torch.load("data/model/model_" + dataset + "_folder_" + str(i) + ".pkl")
        dev_acc = eval(i, model, valid_x, valid_y, valid_sentence_len, mode="Dev")
+       print(dev_acc)
        test_acc = eval(i, model, test_x, test_y, test_sentence_len, mode = "Test")

        avg_test_acc += test_acc
        avg_dev_acc +=dev_acc
    print('Average Dev Acc for %s: %.3f'
          % (dataset, avg_dev_acc/10.0))
    print('Average Testing Acc for %s: %.3f'
          % (dataset, avg_test_acc/10.0))

-import term_matching_baseline
-term_matching_baseline.term_matching_baseline("AskAPatient")
-term_matching_baseline.term_matching_baseline("TwADR-L")
+#import term_matching_baseline
+#term_matching_baseline.term_matching_baseline("AskAPatient")
+#term_matching_baseline.term_matching_baseline("TwADR-L")
rnn_baseline("AskAPatient",train_model=False)
rnn_baseline("TwADR-L",train_model=False)
47 changes: 27 additions & 20 deletions rnn_character.py
@@ -11,8 +11,8 @@
np.random.seed(123)
torch.manual_seed(123)

use_gpu = torch.cuda.is_available()

## parameter setting


def iterate_minibatches(inputs, targets, sentence_len,batchsize, shuffle=False):
Expand All @@ -35,20 +35,18 @@ def iterate_minibatches(inputs, targets, sentence_len,batchsize, shuffle=False):

### parameter setting





### training procedure
def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,valid_y , valid_sentence_len, dataset,folder):
-   embedding_dim = 4096
+   embedding_dim =128
    hidden_dim = 256
    epochs = 50
    batch_size = 50
    learning_rate = 1.0

    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                           vocab_size=len(vocab_dict), label_size=len(label_dict), batch_size=batch_size)
    if use_gpu:
        model = model.cuda()
    train_acc_ = []
    valid_acc_ = []
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate, weight_decay=1e-6)
Expand All @@ -67,13 +65,17 @@ def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,val
            train_target_batch = torch.LongTensor(target_batch)
            train_seq_lens_batch = torch.LongTensor(seq_lens_batch)
            train_seq_lens_batch, perm_index = train_seq_lens_batch.sort(0, descending=True)
-           train_x_batch= train_input_batch[perm_index]
-           train_y_batch = train_target_batch[perm_index]
+           if use_gpu:
+               train_x_batch = Variable(train_input_batch[perm_index]).cuda()
+               train_y_batch = Variable(train_target_batch[perm_index]).cuda()
+           else:
+               train_x_batch = Variable(train_input_batch[perm_index])
+               train_y_batch = Variable(train_target_batch[perm_index])
            model.zero_grad()
            model.batch_size = len(train_x_batch)
            model.hidden = model.init_hidden()
            output = model(train_x_batch,train_seq_lens_batch)
-           loss = loss_function(output, Variable(train_y_batch))
+           loss = loss_function(output, train_y_batch)
            loss.backward()
            optimizer.step()

Expand All @@ -96,13 +98,16 @@ def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,val
            valid_label = torch.LongTensor(target_batch)
            valid_seq_lens_batch = torch.LongTensor(seq_lens_batch)
            valid_seq_lens_batch, perm_index = valid_seq_lens_batch.sort(0, descending=True)
-           valid_x_batch = valid_input[perm_index]
-           valid_y_batch = valid_label[perm_index]
-           valid_x_batch = Variable(valid_x_batch)
+           if use_gpu:
+               valid_x_batch = Variable(valid_input[perm_index]).cuda()
+               valid_y_batch = Variable(valid_label[perm_index]).cuda()
+           else:
+               valid_x_batch = Variable(valid_input[perm_index])
+               valid_y_batch = Variable(valid_label[perm_index])
            model.batch_size = len(valid_x_batch)
            model.hidden = model.init_hidden()
            output = model(valid_x_batch,valid_seq_lens_batch)
-           loss = loss_function(output, Variable(valid_y_batch))
+           loss = loss_function(output, valid_y_batch)

            # calc testing acc
            _, predicted = torch.max(output.data, 1)
@@ -137,15 +142,17 @@ def eval(folder, model,test_x,test_y , test_sentence_len,mode):
        test_label = torch.LongTensor(target_batch)
        test_seq_lens_batch = torch.LongTensor(seq_lens_batch)
        test_seq_lens_batch, perm_index = test_seq_lens_batch.sort(0, descending=True)
-       test_x_batch = test_input[perm_index]
-       test_y_batch = test_label[perm_index]
-       test_x_batch = Variable(test_x_batch)
+       if use_gpu:
+           test_x_batch = Variable(test_input[perm_index]).cuda()
+           test_y_batch = Variable(test_label[perm_index]).cuda()
+       else:
+           test_x_batch = Variable(test_input[perm_index])
+           test_y_batch = Variable(test_label[perm_index])
        model.batch_size = len(test_x_batch)
        model.hidden = model.init_hidden()
        output = model(test_x_batch,test_seq_lens_batch)
        loss_function = nn.CrossEntropyLoss()
-       loss = loss_function(output, Variable(test_y_batch))
-
+       loss = loss_function(output,test_y_batch)
        # calc testing acc
        _, predicted = torch.max(output.data, 1)
        total_acc += np.float((predicted == test_y_batch).sum().item())
Expand All @@ -159,7 +166,7 @@ def eval(folder, model,test_x,test_y , test_sentence_len,mode):
    return test_acc_[0]

#test()
-def rnn_baseline(dataset,train_model):
+def rnn_character(dataset,train_model):
    avg_test_acc = 0.0
    avg_dev_acc = 0.0

@@ -202,5 +209,5 @@ def rnn_baseline(dataset,train_model):



rnn_baseline("AskAPatient",train_model=True)
rnn_character("AskAPatient",train_model=True)
#rnn_baseline("TwADR-L",train_model=False)