diff --git a/gru.py b/gru.py index 89dadbe..8b36fd4 100644 --- a/gru.py +++ b/gru.py @@ -22,11 +22,18 @@ def last_timestep(self, unpacked, lengths): # Index of the last output for each sequence. idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0), unpacked.size(2)).unsqueeze(1) + if torch.cuda.is_available(): + idx = idx.cuda() + return unpacked.gather(1, idx).squeeze() def init_hidden(self): - h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) - c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) + if torch.cuda.is_available(): + h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda() + c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda() + else: + h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) + c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) return (h0, c0) def forward(self, sentence,lengths): @@ -41,4 +48,4 @@ def forward(self, sentence,lengths): hidden_1 = self.hidden2hidden1(last_outputs) hidden_1 = self.relu1(hidden_1) y = self.hidden2label(hidden_1) - return y \ No newline at end of file + return y diff --git a/gru_pretrained.py b/gru_pretrained.py new file mode 100644 index 0000000..372d4e8 --- /dev/null +++ b/gru_pretrained.py @@ -0,0 +1,50 @@ +import torch.nn as nn +import torch.nn.functional as F +import torch +from torch.autograd import Variable + + +class LSTMClassifier(nn.Module): + + def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size): + super(LSTMClassifier, self).__init__() + self.hidden_dim = hidden_dim + self.batch_size = batch_size + self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) + self.lstm = nn.LSTM(embedding_dim, hidden_dim) + self.hidden2hidden1 = nn.Linear(hidden_dim, hidden_dim) + self.relu1 = nn.ReLU() + self.hidden2label = nn.Linear(hidden_dim, label_size) + self.dropout = nn.Dropout(0.5) + + + def last_timestep(self, unpacked, lengths): + # Index of the last output for each sequence. 
+ idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0), + unpacked.size(2)).unsqueeze(1) + if torch.cuda.is_available(): + idx = idx.cuda() + + return unpacked.gather(1, idx).squeeze() + + def init_hidden(self): + if torch.cuda.is_available(): + h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda() + c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)).cuda() + else: + h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) + c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim)) + return (h0, c0) + + def forward(self, sentence,lengths): + + packed = torch.nn.utils.rnn.pack_padded_sequence(sentence, lengths,batch_first=True) + lstm_out, self.hidden = self.lstm(packed, self.hidden) + unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(lstm_out,batch_first=True) + # get the outputs from the last *non-masked* timestep for each sentence + last_outputs = self.last_timestep(unpacked, unpacked_len) + last_outputs = self.dropout(last_outputs) + hidden_1 = self.hidden2hidden1(last_outputs) + hidden_1 = self.relu1(hidden_1) + y = self.hidden2label(hidden_1) + return y diff --git a/read_files.py b/read_files.py new file mode 100644 index 0000000..bec6496 --- /dev/null +++ b/read_files.py @@ -0,0 +1,73 @@ + +# encoding: utf-8 +import json +import numpy as np +import sys +import shutil +if sys.version_info[0]==2: + import cPickle as pickle +else: + import pickle +import os + +def create_folder(filename): + if "\\" in filename: + a = '\\'.join(filename.split('\\')[:-1]) + else: + a = '/'.join(filename.split('/')[:-1]) + if not os.path.exists(a): + os.makedirs(a) + + + +def savein_json(filename, array): + create_folder(filename) + with open(filename+'.txt', 'w') as outfile: + json.dump(array, outfile) + print("Save into files: ",filename) + outfile.close() + +def readfrom_json(filename): + with open(filename+'.txt', 'r') as outfile: + data = json.load(outfile) + outfile.close() + return data + +def savein_pickle(file,array): + create_folder(file) + with open(file, 'wb') as handle: + pickle.dump(array, handle) + +def readfrom_pickle(file): + with open(file, 'rb') as handle: + if sys.version_info[0] == 2: + data = pickle.load(handle) + else: + data = pickle.load(handle,encoding='latin1') + return data + +def readfrom_txt(path): + data =open(path).read() + return data + +def textfile2list(path): + data = readfrom_txt(path) + txt_list =list() + for line in data.splitlines(): + txt_list.append(line) + return txt_list + + +def movefiles(dir_simples,old_address,new_address,abbr=""): + for dir_simple in dir_simples: + desti = dir_simple.replace(old_address,new_address) + desti = desti.replace("TimeNorm.gold.completed.xml","TimeNorm.system.completed.xml") + create_folder(desti) + shutil.copy(dir_simple+abbr,desti) + +def movefiles_folders(dir_simples,old_address,new_address,abbr=""): + for dir_simple in dir_simples: + if not os.path.exists(new_address+"/"+dir_simple): + os.makedirs(new_address+"/"+dir_simple) + shutil.copy(old_address+"/"+dir_simple+"/"+dir_simple+".TimeNorm.gold.completed.xml",new_address+"/"+dir_simple+"/"+dir_simple+".TimeNorm.gold.completed.xml") + diff --git a/rnn_baseline.py b/rnn_baseline.py index c438743..b8eae7e 100644 --- a/rnn_baseline.py +++ b/rnn_baseline.py @@ -6,6 +6,7 @@ from torch.autograd import Variable import torch_preprocess as function import math +import read_files as read np.random.seed(123) @@ -160,14 +161,21 @@ def rnn_baseline(dataset,train_model): print("Run RNN Baseline for %s: ..." 
%(dataset)) + texts, label_texts, labels = function.load_data( + "data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".train.txt", + "data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".validation.txt", + "data/" + dataset + "/" + dataset + ".fold-" + str(0) + ".test.txt") + + vocab_dict, label_dict = function.get_vocab(texts, label_texts, labels) + vocab_dict["UNK"] = len(vocab_dict) + 1 + vocab_dict["PAD"] = 0 + + for i in range(10): texts, label_texts, labels = function.load_data("data/"+dataset + "/"+dataset+".fold-"+ str(i) +".train.txt", "data/"+dataset + "/"+dataset+".fold-"+ str(i) +".validation.txt", "data/"+dataset + "/"+dataset+".fold-"+ str(i) +".test.txt") - vocab_dict, label_dict =function.get_vocab(texts,label_texts,labels) - vocab_dict["UNK"] = len(vocab_dict)+1 - vocab_dict["PAD"] = 0 max_length = 56 train_x,train_y,train_sentence_len = function.dataset_preprocess(texts[0],labels[0], vocab_dict,label_dict,max_length) valid_x,valid_y , valid_sentence_len = function.dataset_preprocess(texts[1],labels[1], vocab_dict,label_dict,max_length) @@ -180,7 +188,9 @@ def rnn_baseline(dataset,train_model): else: model = torch.load("data/model/model_" + dataset + "_folder_" + str(i) + ".pkl") dev_acc = eval(i, model, valid_x, valid_y, valid_sentence_len, mode="Dev") + print(dev_acc) test_acc = eval(i, model, test_x, test_y, test_sentence_len, mode = "Test") + avg_test_acc += test_acc avg_dev_acc +=dev_acc print('Average Dev Acc for %s: %.3f' @@ -188,8 +198,8 @@ def rnn_baseline(dataset,train_model): print('Average Testing Acc for %s: %.3f' % (dataset, avg_test_acc/10.0)) -import term_matching_baseline -term_matching_baseline.term_matching_baseline("AskAPatient") -term_matching_baseline.term_matching_baseline("TwADR-L") +#import term_matching_baseline +#term_matching_baseline.term_matching_baseline("AskAPatient") +#term_matching_baseline.term_matching_baseline("TwADR-L") rnn_baseline("AskAPatient",train_model=False) rnn_baseline("TwADR-L",train_model=False) \ No newline at end of file diff --git a/rnn_character.py b/rnn_character.py index 2a78c5c..0161ce8 100644 --- a/rnn_character.py +++ b/rnn_character.py @@ -11,8 +11,8 @@ np.random.seed(123) torch.manual_seed(123) +use_gpu = torch.cuda.is_available() -## parameter setting def iterate_minibatches(inputs, targets, sentence_len,batchsize, shuffle=False): @@ -35,13 +35,9 @@ def iterate_minibatches(inputs, targets, sentence_len,batchsize, shuffle=False): ### parameter setting - - - - ### training procedure def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,valid_y , valid_sentence_len, dataset,folder): - embedding_dim = 4096 + embedding_dim =128 hidden_dim = 256 epochs = 50 batch_size = 50 @@ -49,6 +45,8 @@ def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,val model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim, vocab_size=len(vocab_dict), label_size=len(label_dict), batch_size=batch_size) + if use_gpu: + model = model.cuda() train_acc_ = [] valid_acc_ = [] optimizer = optim.Adadelta(model.parameters(), lr=learning_rate, weight_decay=1e-6) @@ -67,13 +65,17 @@ def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,val train_target_batch = torch.LongTensor(target_batch) train_seq_lens_batch = torch.LongTensor(seq_lens_batch) train_seq_lens_batch, perm_index = train_seq_lens_batch.sort(0, descending=True) - train_x_batch= train_input_batch[perm_index] - train_y_batch = train_target_batch[perm_index] + if use_gpu: + train_x_batch = 
Variable(train_input_batch[perm_index]).cuda() + train_y_batch = Variable(train_target_batch[perm_index]).cuda() + else: + train_x_batch = Variable(train_input_batch[perm_index]) + train_y_batch = Variable(train_target_batch[perm_index]) model.zero_grad() model.batch_size = len(train_x_batch) model.hidden = model.init_hidden() output = model(train_x_batch,train_seq_lens_batch) - loss = loss_function(output, Variable(train_y_batch)) + loss = loss_function(output, train_y_batch) loss.backward() optimizer.step() @@ -96,13 +98,16 @@ def train(vocab_dict, label_dict, train_x,train_y,train_sentence_len,valid_x,val valid_label = torch.LongTensor(target_batch) valid_seq_lens_batch = torch.LongTensor(seq_lens_batch) valid_seq_lens_batch, perm_index = valid_seq_lens_batch.sort(0, descending=True) - valid_x_batch = valid_input[perm_index] - valid_y_batch = valid_label[perm_index] - valid_x_batch = Variable(valid_x_batch) + if use_gpu: + valid_x_batch = Variable(valid_input[perm_index]).cuda() + valid_y_batch = Variable(valid_label[perm_index]).cuda() + else: + valid_x_batch = Variable(valid_input[perm_index]) + valid_y_batch = Variable(valid_label[perm_index]) model.batch_size = len(valid_x_batch) model.hidden = model.init_hidden() output = model(valid_x_batch,valid_seq_lens_batch) - loss = loss_function(output, Variable(valid_y_batch)) + loss = loss_function(output, valid_y_batch) # calc testing acc _, predicted = torch.max(output.data, 1) @@ -137,15 +142,17 @@ def eval(folder, model,test_x,test_y , test_sentence_len,mode): test_label = torch.LongTensor(target_batch) test_seq_lens_batch = torch.LongTensor(seq_lens_batch) test_seq_lens_batch, perm_index = test_seq_lens_batch.sort(0, descending=True) - test_x_batch = test_input[perm_index] - test_y_batch = test_label[perm_index] - test_x_batch = Variable(test_x_batch) + if use_gpu: + test_x_batch = Variable(test_input[perm_index]).cuda() + test_y_batch = Variable(test_label[perm_index]).cuda() + else: + test_x_batch = Variable(test_input[perm_index]) + test_y_batch = Variable(test_label[perm_index]) model.batch_size = len(test_x_batch) model.hidden = model.init_hidden() output = model(test_x_batch,test_seq_lens_batch) loss_function = nn.CrossEntropyLoss() - loss = loss_function(output, Variable(test_y_batch)) - + loss = loss_function(output,test_y_batch) # calc testing acc _, predicted = torch.max(output.data, 1) total_acc += np.float((predicted == test_y_batch).sum().item()) @@ -159,7 +166,7 @@ def eval(folder, model,test_x,test_y , test_sentence_len,mode): return test_acc_[0] #test() -def rnn_baseline(dataset,train_model): +def rnn_character(dataset,train_model): avg_test_acc = 0.0 avg_dev_acc = 0.0 @@ -202,5 +209,5 @@ def rnn_baseline(dataset,train_model): -rnn_baseline("AskAPatient",train_model=True) +rnn_character("AskAPatient",train_model=True) #rnn_baseline("TwADR-L",train_model=False) \ No newline at end of file diff --git a/rnn_characters_pretrained.py b/rnn_characters_pretrained.py new file mode 100644 index 0000000..612cd66 --- /dev/null +++ b/rnn_characters_pretrained.py @@ -0,0 +1,253 @@ +import numpy as np +import torch +from gru_pretrained import LSTMClassifier +import torch.optim as optim +import torch.nn as nn +from torch.autograd import Variable +import torch_preprocess as function +import math +from flair.models import LanguageModel + +np.random.seed(123) +torch.manual_seed(123) + +use_gpu = torch.cuda.is_available() + + +def iterate_minibatches(inputs, targets, sentence_len, batchsize, shuffle=False): + assert len(inputs) == 
len(targets) + indices = np.arange(len(inputs)) + if shuffle: + np.random.shuffle(indices) + for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): + if shuffle: + excerpt = indices[start_idx:start_idx + batchsize] + else: + excerpt = indices[start_idx:start_idx + batchsize] + sentence_len_batch = sentence_len[excerpt] + input_batch = [] + for i, text_id in enumerate(excerpt): + input_batch.append(inputs[text_id]) + + yield input_batch, targets[excerpt], sentence_len_batch + + ### parameter setting + + +def get_padded_sentence(sentences,is_forward_lm): + + sentences_padded = [] + longest_character_sequence_in_batch: int = 0 + for sentence in sentences: + if len( + sentence) > longest_character_sequence_in_batch: longest_character_sequence_in_batch \ + = len(sentence) + + for sentence in sentences: + if is_forward_lm: + sentences_padded.append( + sentence + ((longest_character_sequence_in_batch - len(sentence)) * ' ')) + else: + sentences_padded.append( + sentence[::-1] + ( + (longest_character_sequence_in_batch - len(sentence)) * ' ')) + + return sentences_padded + +def get_pretrained_input(input_batch): + lm_f = LanguageModel.load_language_model('/home/dongfang/.flair/embeddings/lm-news-english-forward-v0.2rc.pt') + lm_b = LanguageModel.load_language_model('/home/dongfang/.flair/embeddings/lm-news-english-backward-v0.2rc.pt') + if use_gpu: + lm_f.cuda() + lm_b.cuda() + rnn_f = lm_f.get_representation(get_padded_sentence(input_batch, is_forward_lm=True)) + rnn_f = rnn_f.transpose(0, 1) + rnn_b = lm_b.get_representation(get_padded_sentence(input_batch, is_forward_lm=False)) + rnn_b = rnn_b.transpose(0, 1) + rnn = torch.cat((rnn_f,rnn_b),dim=2) + return rnn + + + + +### training procedure +def train(vocab_dict, label_dict, train_x, train_y, train_sentence_len, valid_x, valid_y, valid_sentence_len, dataset, + folder): + embedding_dim = 4096 + hidden_dim = 256 + epochs = 50 + batch_size = 50 + learning_rate = 1.0 + + model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim, + vocab_size=len(vocab_dict), label_size=len(label_dict), batch_size=batch_size) + if use_gpu: + model = model.cuda() + train_acc_ = [] + valid_acc_ = [] + optimizer = optim.Adadelta(model.parameters(), lr=learning_rate, weight_decay=1e-6) + loss_function = nn.CrossEntropyLoss() + train_loss_ = [] + valid_loss_ = [] + val_acc = -math.inf + for epoch in range(epochs): + ## training epoch + total_acc = 0.0 + total_loss = 0.0 + total = 0.0 + for batch in iterate_minibatches(train_x, train_y, train_sentence_len, batchsize=batch_size, shuffle=True): + input_batch, target_batch, seq_lens_batch = batch + train_target_batch = torch.LongTensor(target_batch) + train_seq_lens_batch = torch.LongTensor(seq_lens_batch) + train_seq_lens_batch, perm_index = train_seq_lens_batch.sort(0, descending=True) + train_input_batch = get_pretrained_input(input_batch) + train_x_batch = train_input_batch[perm_index] + if use_gpu: + train_y_batch = Variable(train_target_batch[perm_index]).cuda() + else: + train_y_batch = Variable(train_target_batch[perm_index]) + + model.zero_grad() + model.batch_size = len(train_x_batch) + model.hidden = model.init_hidden() + output = model(train_x_batch, train_seq_lens_batch) + loss = loss_function(output, train_y_batch) + loss.backward() + optimizer.step() + + # calc training acc + _, predicted = torch.max(output.data, 1) + total_acc += np.float((predicted == train_y_batch).sum().item()) + total += len(train_y_batch) + total_loss += loss.item() + + train_loss_.append(total_loss / total) 
+ train_acc_.append(total_acc / total) + ## validing epoch + total_acc = 0.0 + total_loss = 0.0 + total = 0.0 + + for batch in iterate_minibatches(valid_x, valid_y, valid_sentence_len, batchsize=batch_size, shuffle=False): + input_batch, target_batch, seq_lens_batch = batch + valid_label = torch.LongTensor(target_batch) + valid_seq_lens_batch = torch.LongTensor(seq_lens_batch) + valid_seq_lens_batch, perm_index = valid_seq_lens_batch.sort(0, descending=True) + + valid_input = get_pretrained_input(input_batch) + valid_x_batch = valid_input[perm_index] + if use_gpu: + valid_y_batch = Variable(valid_label[perm_index]).cuda() + else: + valid_y_batch = Variable(valid_label[perm_index]) + model.batch_size = len(valid_x_batch) + model.hidden = model.init_hidden() + output = model(valid_x_batch, valid_seq_lens_batch) + loss = loss_function(output, valid_y_batch) + + # calc testing acc + _, predicted = torch.max(output.data, 1) + total_acc += np.float((predicted == valid_y_batch).sum().item()) + total += len(valid_y_batch) + total_loss += loss.item() + if (total_acc / total > val_acc): + torch.save(model, "data/model1/model_" + dataset + "_folder_" + str(folder) + ".pkl") + val_acc = total_acc / total + valid_loss_.append(total_loss / total) + valid_acc_.append(total_acc / total) + print( + '[Epoch: %3d/%3d] Training Loss: %.3f, Validating Loss: %.3f, Training Acc: %.3f, Validing Acc: %.3f' + % (epoch, epochs, train_loss_[epoch], valid_loss_[epoch], train_acc_[epoch], + valid_acc_[epoch])) + + +# train() + +def eval(folder, model, test_x, test_y, test_sentence_len, mode): + ## testing epoch + test_loss_ = [] + test_acc_ = [] + total_acc = 0.0 + total_loss = 0.0 + total = 0.0 + + batch_size = 50 + for batch in iterate_minibatches(test_x, test_y, test_sentence_len, batchsize=batch_size, shuffle=False): + input_batch, target_batch, seq_lens_batch = batch + test_label = torch.LongTensor(target_batch) + test_seq_lens_batch = torch.LongTensor(seq_lens_batch) + test_seq_lens_batch, perm_index = test_seq_lens_batch.sort(0, descending=True) + test_input = get_pretrained_input(input_batch) + test_x_batch = test_input[perm_index] + if use_gpu: + test_y_batch = Variable(test_label[perm_index]).cuda() + else: + test_y_batch = Variable(test_label[perm_index]) + model.batch_size = len(test_x_batch) + model.hidden = model.init_hidden() + output = model(test_x_batch, test_seq_lens_batch) + loss_function = nn.CrossEntropyLoss() + loss = loss_function(output, test_y_batch) + + # calc testing acc + _, predicted = torch.max(output.data, 1) + total_acc += np.float((predicted == test_y_batch).sum().item()) + total += len(test_y_batch) + total_loss += loss.item() + test_loss_.append(total_loss / total) + test_acc_.append(total_acc / total) + + # print('%s Loss for folder %s: %.3f, %s Acc: %.3f' + # % (mode ,str(folder),test_loss_[0], mode, test_acc_[0])) + return test_acc_[0] + + +# test() +def rnn_character(dataset, train_model): + avg_test_acc = 0.0 + avg_dev_acc = 0.0 + + print("Run RNN Baseline for %s: ..." 
% (dataset)) + lm_f = LanguageModel.load_language_model('/home/dongfang/.flair/embeddings/lm-news-english-forward-v0.2rc.pt') + dictionary = lm_f.dictionary + vocab_dict = dict() + for i, j in dictionary.item2idx.items(): + vocab_dict[i.decode('utf-8')] = j + for i in range(1): + texts, label_texts, labels = function.load_data( + "data/" + dataset + "/" + dataset + ".fold-" + str(i) + ".train.txt", + "data/" + dataset + "/" + dataset + ".fold-" + str(i) + ".validation.txt", + "data/" + dataset + "/" + dataset + ".fold-" + str(i) + ".test.txt") + + _, label_dict = function.get_vocab(texts, label_texts, labels) + vocab_dict["UNK"] = len(vocab_dict) + 1 + train_x, train_y, train_sentence_len = function.dataset_preprocess_character_pretrained(texts[0], labels[0], vocab_dict, + label_dict) + valid_x, valid_y, valid_sentence_len = function.dataset_preprocess_character_pretrained(texts[1], labels[1], vocab_dict, + label_dict) + test_x, test_y, test_sentence_len = function.dataset_preprocess_character_pretrained(texts[2], labels[2], vocab_dict, + label_dict) + if train_model == True: + train(vocab_dict, label_dict, train_x, train_y, train_sentence_len, valid_x, valid_y, valid_sentence_len, + dataset, i) + model = torch.load("data/model1/model_" + dataset + "_folder_" + str(i) + ".pkl") + dev_acc = eval(i, model, valid_x, valid_y, valid_sentence_len, mode="Dev") + test_acc = eval(i, model, test_x, test_y, test_sentence_len, mode="Test") + else: + model = torch.load("data/model1/model_" + dataset + "_folder_" + str(i) + ".pkl") + dev_acc = eval(i, model, valid_x, valid_y, valid_sentence_len, mode="Dev") + test_acc = eval(i, model, test_x, test_y, test_sentence_len, mode="Test") + avg_test_acc += test_acc + avg_dev_acc += dev_acc + print('Average Dev Acc for %s: %.3f' + % (dataset, avg_dev_acc / 10.0)) + print('Average Testing Acc for %s: %.3f' + % (dataset, avg_test_acc / 10.0)) + + +import term_matching_baseline +from flair.embeddings import CharLMEmbeddings +from flair.data import Sentence + +rnn_character("AskAPatient", train_model=True) +# rnn_baseline("TwADR-L",train_model=False) \ No newline at end of file diff --git a/torch_preprocess.py b/torch_preprocess.py index b63e102..940350b 100644 --- a/torch_preprocess.py +++ b/torch_preprocess.py @@ -120,3 +120,20 @@ def dataset_preprocess_character(texts,labels, vocab_dict,label_dict): label_y.append(label_dict[labels[index]]) return text_x,np.asanyarray(label_y),np.asarray(sent_length) +def dataset_preprocess_character_pretrained(texts,labels, vocab_dict,label_dict): + text_x = [] + label_y = [] + sent_length = [] + + for index, sent in enumerate(texts): + #word_list = word_tokenize(sent) + sent_length.append(len(sent)) + #text_x.append(get_idx_from_sent_character(sent,vocab_dict)) + # if index <=2000: + # label_y.append(0) + # elif 3000<=index<=7000: + # label_y.append(1) + # else: + # label_y.append(2) + label_y.append(label_dict[labels[index]]) + return texts,np.asanyarray(label_y),np.asarray(sent_length)
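
A minimal sketch of the indexing trick behind last_timestep() in gru.py / gru_pretrained.py, with illustrative shapes (a batch of 3 padded sequences, hidden size 4); the toy tensor values and names below are assumptions for demonstration only, not part of the patch:

import torch

# Toy padded output, as pad_packed_sequence(batch_first=True) would return it:
# shape (batch=3, max_len=5, hidden=4), plus the true (unpadded) lengths.
unpacked = torch.arange(3 * 5 * 4, dtype=torch.float).view(3, 5, 4)
lengths = torch.tensor([5, 3, 2])

# Same indexing as last_timestep(): expand (lengths - 1) to shape
# (batch, 1, hidden) and gather along the time dimension, so each row of
# `last` is the output at that sequence's final non-padded timestep.
idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0), unpacked.size(2)).unsqueeze(1)
last = unpacked.gather(1, idx).squeeze(1)          # (batch, hidden)

# Equivalent advanced-indexing form, kept here only as a cross-check.
assert torch.equal(last, unpacked[torch.arange(3), lengths - 1])

One caveat worth noting: the patched code calls .squeeze() with no dimension argument, which would also drop the batch dimension if a batch ever contained a single sequence; .squeeze(1) avoids that edge case. The sort by perm_index before every forward pass is there because pack_padded_sequence in the pre-1.1 PyTorch versions this code targets requires sequences ordered by decreasing length.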
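
For the pretrained-character path, get_padded_sentence() right-pads every string with spaces to the longest string in the batch, and reverses the characters first when preparing input for the backward language model, before the batch is handed to flair's LanguageModel.get_representation(). A self-contained restatement of that padding logic, using hypothetical example strings:

def pad_batch(sentences, is_forward_lm=True):
    # Right-pad with spaces to the batch maximum; the backward LM reads each
    # string reversed, so reverse before padding (mirroring the patch).
    max_len = max(len(s) for s in sentences)
    padded = []
    for s in sentences:
        base = s if is_forward_lm else s[::-1]
        padded.append(base + ' ' * (max_len - len(s)))
    return padded

print(pad_batch(['sore arm', 'rash']))         # ['sore arm', 'rash    ']
print(pad_batch(['sore arm', 'rash'], False))  # ['mra eros', 'hsar    ']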