From 631aeefc90a4111b654dfefc58526b4796f20a61 Mon Sep 17 00:00:00 2001
From: Zhe Wang
Date: Fri, 1 Dec 2017 16:20:10 -0800
Subject: [PATCH] Add files via upload

---
 prepro_7w.py | 342 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 342 insertions(+)
 create mode 100644 prepro_7w.py

diff --git a/prepro_7w.py b/prepro_7w.py
new file mode 100644
index 0000000..6c32a13
--- /dev/null
+++ b/prepro_7w.py
@@ -0,0 +1,342 @@
+"""
+Preprocess a raw json dataset into hdf5/json files.
+
+Questions and answers are tokenized with NLTK or a simple regex split.
+"""
+import copy
+from random import shuffle, seed
+import sys
+import os.path
+import argparse
+import glob
+import numpy as np
+from scipy.misc import imread, imresize
+import scipy.io
+import pdb
+import string
+import h5py
+from nltk.tokenize import word_tokenize
+import json
+from autocorrect import spell
+import re
+
+def tokenize(sentence):
+    # split on punctuation/whitespace and drop empty tokens
+    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence) if i != '' and i != ' ' and i != '\n']
+
+def nltk_tokenize(sent):
+    sent = str(sent).lower()
+    sent = sent.replace("-", " ")
+    sent = sent.replace("/", " ")
+    sent = sent.replace("`", " ")
+    token = word_tokenize(sent)
+
+    # spell-correct alphabetic tokens
+    for i in range(len(token)):
+        if token[i].isalpha():
+            token[i] = spell(token[i]).lower()
+    return token
+
+
+def prepro_question(imgs, params):
+
+    # tokenize every question and its candidate answer
+    print 'example processed tokens:'
+    for i, img in enumerate(imgs):
+        s = img['question']
+        if params['token_method'] == 'nltk':
+            txt = word_tokenize(str(s).lower())
+        else:
+            txt = tokenize(s)
+        img['processed_tokens'] = txt
+
+        # preprocess the candidate answer the same way
+        ans = img['MC_ans']
+        if params['token_method'] == 'nltk':
+            ans_txt = word_tokenize(str(ans).lower())
+        else:
+            ans_txt = tokenize(ans)
+        img['processed_ans'] = ans_txt
+
+        if i < 10: print txt
+        if i % 1000 == 0:
+            sys.stdout.write("processing question %d/%d (%.2f%% done)   \r" % (i, len(imgs), i*100.0/len(imgs)))
+            sys.stdout.flush()
+
+    return imgs
+
+def build_vocab_question(imgs, params):
+    # build a vocabulary that covers both question and answer tokens
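+    # each entry in imgs is assumed to look roughly like this (field names
+    # are the ones accessed in this file; the values are only illustrative):
+    #   {'question': 'what is the man holding?', 'MC_ans': 'a phone',
+    #    'ques_id': 123, 'img_path': 'images/v7w_123.jpg',
+    #    'processed_tokens': [...], 'processed_ans': [...]}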
+    count_thr = params['word_count_threshold']
+
+    # count up the number of words
+    counts = {}
+    for img in imgs:
+        for w in img['processed_tokens']:
+            counts[w] = counts.get(w, 0) + 1
+        for w in img['processed_ans']:
+            counts[w] = counts.get(w, 0) + 1
+    cw = sorted([(count, w) for w, count in counts.iteritems()], reverse=True)
+    print 'top words and their counts:'
+    print '\n'.join(map(str, cw[:20]))
+
+    # print some stats
+    total_words = sum(counts.itervalues())
+    print 'total words:', total_words
+    bad_words = [w for w, n in counts.iteritems() if n <= count_thr]
+    vocab = [w for w, n in counts.iteritems() if n > count_thr]
+    bad_count = sum(counts[w] for w in bad_words)
+    print 'number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))
+    print 'number of words in vocab would be %d' % (len(vocab), )
+    print 'number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words)
+
+    # produce the final annotation: map infrequent words to a special UNK token
+    print 'inserting the special UNK token'
+    vocab.append('UNK')
+
+    for img in imgs:
+        txt = img['processed_tokens']
+        question = [w if counts.get(w, 0) > count_thr else 'UNK' for w in txt]
+        img['final_question'] = question
+
+        txt = img['processed_ans']
+        ans = [w if counts.get(w, 0) > count_thr else 'UNK' for w in txt]
+        img['final_ans'] = ans
+
+    return imgs, vocab
+
+def apply_vocab_question(imgs, wtoi):
+    # apply the training vocabulary to the test split
+    for img in imgs:
+        txt = img['processed_tokens']
+        question = [w if w in wtoi else 'UNK' for w in txt]
+        img['final_question'] = question
+
+        txt = img['processed_ans']
+        ans = [w if w in wtoi else 'UNK' for w in txt]
+        img['final_ans'] = ans
+
+    return imgs
+
+def get_top_answers(imgs, params):
+    counts = {}
+    for img in imgs:
+        ans = img['ans']
+        counts[ans] = counts.get(ans, 0) + 1
+
+    cw = sorted([(count, w) for w, count in counts.iteritems()], reverse=True)
+    print 'top answers and their counts:'
+    print '\n'.join(map(str, cw[:20]))
+
+    return [w for _, w in cw[:params['num_ans']]]
+
+def encode_question(imgs, params, wtoi):
+    # encode both question and answer as fixed-length arrays of word ids
+
+    max_length = params['max_length']
+    N = len(imgs)
+
+    label_arrays = np.zeros((N, max_length), dtype='uint32')
+    label_length = np.zeros(N, dtype='uint32')
+
+    ans_arrays = np.zeros((N, max_length), dtype='uint32')
+    ans_length = np.zeros(N, dtype='uint32')
+
+    question_id = np.zeros(N, dtype='uint32')
+    for i, img in enumerate(imgs):
+        question_id[i] = img['ques_id']
+        label_length[i] = min(max_length, len(img['final_question']))  # record the length of this sequence
+        ans_length[i] = min(max_length, len(img['final_ans']))
+
+        for k, w in enumerate(img['final_question']):
+            if k < max_length:
+                label_arrays[i, k] = wtoi[w]
+        for k, w in enumerate(img['final_ans']):
+            if k < max_length:
+                ans_arrays[i, k] = wtoi[w]
+
+    return label_arrays, label_length, ans_arrays, ans_length, question_id
+
+
+def encode_answer(imgs):
+    # assumes img['ans'] already holds an integer label in this pipeline
+    N = len(imgs)
+    ans_arrays = np.zeros(N, dtype='uint32')
+
+    for i, img in enumerate(imgs):
+        ans_arrays[i] = img['ans']
+
+    return ans_arrays
+
+def encode_mc_answer(imgs, atoi):
+    N = len(imgs)
+    # width 18 = maximum number of multiple-choice candidates per question
+    mc_ans_arrays = np.zeros((N, 18), dtype='uint32')
+
+    for i, img in enumerate(imgs):
+        for j, ans in enumerate(img['MC_ans']):
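+            # candidates outside the top-answer vocabulary fall back to id 0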
+            mc_ans_arrays[i, j] = atoi.get(ans, 0)
+    return mc_ans_arrays
+
+def filter_question(imgs, atoi):
+    # keep only questions whose answer is in the top-answer vocabulary
+    new_imgs = []
+    for i, img in enumerate(imgs):
+        if img['ans'] in atoi:
+            new_imgs.append(img)
+
+    print 'question number reduced from %d to %d' % (len(imgs), len(new_imgs))
+    return new_imgs
+
+def get_unique_img(imgs):
+    count_img = {}
+    N = len(imgs)
+    img_pos = np.zeros(N, dtype='uint32')
+    for img in imgs:
+        count_img[img['img_path']] = count_img.get(img['img_path'], 0) + 1
+
+    unique_img = [w for w, n in count_img.iteritems()]
+    imgtoi = {w: i+1 for i, w in enumerate(unique_img)}  # add one for torch, since torch starts from 1
+
+    for i, img in enumerate(imgs):
+        img_pos[i] = imgtoi.get(img['img_path'])
+
+    return unique_img, img_pos
+
+def main(params):
+
+    imgs_train = json.load(open(params['input_train_json'], 'r'))
+    imgs_test = json.load(open(params['input_test_json'], 'r'))
+
+    '''
+    # get top answers
+    top_ans = get_top_answers(imgs_train, params)
+    atoi = {w: i+1 for i, w in enumerate(top_ans)}
+    itoa = {i+1: w for i, w in enumerate(top_ans)}
+
+    # filter out questions whose answer is not among the top answers
+    imgs_train = filter_question(imgs_train, atoi)
+    '''
+
+    # seed(123)            # make reproducible
+    # shuffle(imgs_train)  # shuffle the order
+
+    # tokenize and preprocess the training questions
+    imgs_train = prepro_question(imgs_train, params)
+    # tokenize and preprocess the testing questions
+    imgs_test = prepro_question(imgs_test, params)
+
+    # create the vocab for questions and answers
+    imgs_train, vocab = build_vocab_question(imgs_train, params)
+
+    itow = {i+1: w for i, w in enumerate(vocab)}  # a 1-indexed vocab translation table
+    wtoi = {w: i+1 for i, w in enumerate(vocab)}  # inverse table
+
+    # read glove vectors
+    with open(params['vector_file'], 'r') as f:
+        vectors = {}
+        for line in f:
+            vals = line.rstrip().split(' ')
+            vectors[vals[0]] = [float(x) for x in vals[1:]]
+
+    vocab_size = len(vocab)
+
+    vector_dim = 300
+    glove = np.zeros((vocab_size, vector_dim))
+    num = 0  # count vocab words that have no glove vector
+    for word in vocab:
+        if word in vectors:
+            glove[wtoi[word]-1, :] = vectors[word]
+        else:
+            num += 1
+            # glove[wtoi[word]-1, :] = np.random.uniform(-0.10, 0.10, 300)
+    print 'words not covered by glove:', num
+
+    # normalize each word vector to unit L2 norm; all-zero rows stay zero
+    d = np.sum(glove ** 2, 1) ** 0.5
+    glove_norm = (glove.T / d).T
+    glove_norm[np.isnan(glove_norm)] = 0
+
+    ques_train, ques_length_train, ans_train, ans_length_train, question_id_train = encode_question(imgs_train, params, wtoi)
+
+    imgs_test = apply_vocab_question(imgs_test, wtoi)
+    ques_test, ques_length_test, ans_test, ans_length_test, question_id_test = encode_question(imgs_test, params, wtoi)
+
+    # get the unique images for train and test
+    unique_img_train, img_pos_train = get_unique_img(imgs_train)
+    unique_img_test, img_pos_test = get_unique_img(imgs_test)
+
+    # get the answer encoding
+    target_train = encode_answer(imgs_train)
+    target_test = encode_answer(imgs_test)
+
+    # create the output h5 file for the training and test sets.
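+    # datasets written below (all uint32 except emb_matrix):
+    #   ques_*, ans_*                (N, max_length)  zero-padded word ids
+    #   ques_length_*, ans_length_*  (N,)             true lengths before padding
+    #   target_*                     (N,)             answer labels
+    #   question_id_*                (N,)             original question ids
+    #   img_pos_*                    (N,)             1-indexed into unique_img_*
+    #   emb_matrix                   (vocab_size, 300) unit-norm GloVe rows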
+    f = h5py.File(params['output_h5'], "w")
+    f.create_dataset("ques_train", dtype='uint32', data=ques_train)
+    f.create_dataset("ques_length_train", dtype='uint32', data=ques_length_train)
+    f.create_dataset("ans_train", dtype='uint32', data=ans_train)
+    f.create_dataset("ans_length_train", dtype='uint32', data=ans_length_train)
+    f.create_dataset("target_train", dtype='uint32', data=target_train)
+    f.create_dataset("question_id_train", dtype='uint32', data=question_id_train)
+    f.create_dataset("img_pos_train", dtype='uint32', data=img_pos_train)
+
+    f.create_dataset("ques_test", dtype='uint32', data=ques_test)
+    f.create_dataset("ques_length_test", dtype='uint32', data=ques_length_test)
+    f.create_dataset("ans_test", dtype='uint32', data=ans_test)
+    f.create_dataset("ans_length_test", dtype='uint32', data=ans_length_test)
+    f.create_dataset("question_id_test", dtype='uint32', data=question_id_test)
+    f.create_dataset("img_pos_test", dtype='uint32', data=img_pos_test)
+    f.create_dataset("target_test", dtype='uint32', data=target_test)
+    f.create_dataset("emb_matrix", dtype='float32', data=glove_norm)
+
+    f.close()
+    print 'wrote ', params['output_h5']
+
+    # create output json file
+    out = {}
+    out['ix_to_word'] = itow  # encode the (1-indexed) vocab
+    # out['ix_to_ans'] = itoa
+    out['unique_img_train'] = unique_img_train
+    out['unique_img_test'] = unique_img_test
+    json.dump(out, open(params['output_json'], 'w'))
+    print 'wrote ', params['output_json']
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # input json
+    parser.add_argument('--input_train_json', default='vqa_raw_train.json', help='input json file to process into hdf5')
+    parser.add_argument('--input_test_json', default='vqa_raw_test.json', help='input json file to process into hdf5')
+    # num_ans: number of top answers
+    parser.add_argument('--num_ans', default=100, type=int, help='number of top answers for the final classification')
+
+    parser.add_argument('--output_json', default='data_prepro_0417.json', help='output json file')
+    parser.add_argument('--output_h5', default='data_prepro_0417.h5', help='output h5 file')
+
+    # options
+    parser.add_argument('--max_length', default=26, type=int, help='max length of a question, in words; longer questions get clipped')
+    parser.add_argument('--word_count_threshold', default=0, type=int, help='only words that occur more than this number of times go into the vocab')
+    parser.add_argument('--num_test', default=0, type=int, help='number of test images (to withhold until the very end)')
+    parser.add_argument('--token_method', default='nltk', help="tokenization method; 'nltk' is much slower than the regex split")
+
+    parser.add_argument('--batch_size', default=10, type=int)
+    parser.add_argument('--vector_file', default='glove.6B.300d.txt', type=str)
+
+    args = parser.parse_args()
+    params = vars(args)  # convert to an ordinary dict
+    print 'parsed input parameters:'
+    print json.dumps(params, indent=2)
+
+    # pdb.set_trace()
+    main(params)
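
A minimal sketch of how the generated files can be consumed downstream (the
file names are just the argparse defaults above; the dataset names match the
create_dataset calls in main):

    import json
    import h5py

    meta = json.load(open('data_prepro_0417.json'))
    ix_to_word = meta['ix_to_word']              # 1-indexed vocab table
    unique_img_train = meta['unique_img_train']

    with h5py.File('data_prepro_0417.h5', 'r') as f:
        ques_train = f['ques_train'][:]          # (N, max_length) uint32 word ids
        img_pos_train = f['img_pos_train'][:]    # 1-indexed into unique_img_train
        emb_matrix = f['emb_matrix'][:]          # (vocab_size, 300) GloVe rows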