Commit

Works for DC+NC+YD+HMYW+Count. 1. New pattn2count pretraining, data creation. 2. Model loads pre-trained count. 3. Figuring out a new method to compute max and argmax. Will be a major change. This commit can be used to come back to.
nitishgupta committed May 10, 2019
1 parent bca704a commit 0acb0bc
Showing 20 changed files with 872 additions and 548 deletions.
6 changes: 3 additions & 3 deletions allenconfigs/semqa/train/drop_parser_wmodel.jsonnet
@@ -231,7 +231,7 @@ local compareff_inputdim =
"type": "gru",
"input_size": 4,
"hidden_size": 20,
"num_layers": 3,
"num_layers": 2,
"bidirectional": true,
},

@@ -254,10 +254,10 @@ local compareff_inputdim =

"initializers":
[
["passage_attention_to_count|passage_count_predictor",
["passage_attention_to_count|passage_count_hidden2logits",
{
"type": "pretrained",
"weights_file_path": "./resources/semqa/checkpoints/savedmodels/num2count_vstd/best.th"
"weights_file_path": "./resources/semqa/checkpoints/drop_pattn2count/T_gru/Isize_4/Hsize_20/Layers_2/S_100/t600_v600/best.th"
},
],
[".*_text_field_embedder.*", "prevent"]
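The initializer block above copies pretrained weights into the two count submodules selected by the regex, while the second rule, [".*_text_field_embedder.*", "prevent"], blocks initialization of the embedder. For orientation, here is a minimal PyTorch sketch of what a pattn2count module with these hyperparameters and a regex-based selective weight load could look like; only the layer names, GRU sizes, and the idea of regex-filtered loading come from the config, while the class, the helper, and the per-token head are illustrative assumptions.

```python
import re
import torch
from torch import nn


class PattnToCount(nn.Module):
    """Sketch of a passage-attention-to-count module matching the config above."""

    def __init__(self, input_size: int = 4, hidden_size: int = 20, num_layers: int = 2):
        super().__init__()
        # Bidirectional GRU over per-token passage-attention features
        self.passage_attention_to_count = nn.GRU(
            input_size=input_size, hidden_size=hidden_size,
            num_layers=num_layers, bidirectional=True, batch_first=True)
        # 2 * hidden_size because the encoder is bidirectional
        self.passage_count_hidden2logits = nn.Linear(2 * hidden_size, 1)

    def forward(self, pattn_features: torch.Tensor) -> torch.Tensor:
        # pattn_features: (batch, passage_length, input_size)
        encoded, _ = self.passage_attention_to_count(pattn_features)
        # One logit per token; how these are pooled into a count is model-specific
        return self.passage_count_hidden2logits(encoded).squeeze(-1)


def load_pretrained_submodules(model: nn.Module, weights_path: str, name_regex: str) -> None:
    """Copy only parameters whose names match name_regex, in the spirit of the
    "pretrained" initializer above (assumes both models share parameter names)."""
    pretrained = torch.load(weights_path, map_location="cpu")
    regex = re.compile(name_regex)
    selected = {name: tensor for name, tensor in pretrained.items() if regex.search(name)}
    model.load_state_dict(selected, strict=False)
```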
29 changes: 20 additions & 9 deletions allenconfigs/semqa/train/passage_attn2count.jsonnet
@@ -1,15 +1,26 @@
local utils = import 'utils.libsonnet';

local utils = import "utils.libsonnet";

{
"dataset_reader": {
"type": "passage_attn2count_reader",
"min_passage_length": 200,
"max_passage_length": 400,
"max_span_length": 10,
"num_training_samples": 2000,
"normalized": utils.boolparser(std.extVar("NORM")),
"withnoise": utils.boolparser(std.extVar("NOISE")),
"min_passage_length": 100,
"max_passage_length": 600,
"min_span_length": 5,
"max_span_length": 15,
"samples_per_bucket_count": 2000,
"normalized": true,
"withnoise": true,
},

"validation_dataset_reader": {
"type": "passage_attn2count_reader",
"min_passage_length": 100,
"max_passage_length": 600,
"min_span_length": 5,
"max_span_length": 15,
"samples_per_bucket_count": 500,
"normalized": true,
"withnoise": true,
},

"train_data_path": std.extVar("TRAINING_DATA_FILE"),
@@ -31,7 +42,7 @@ local utils = import 'utils.libsonnet';
"iterator": {
"type": "basic",
"batch_size": std.extVar("BS"),
"max_instances_in_memory": std.extVar("BS")
"max_instances_in_memory": 1000000,
},

"trainer": {
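As a rough sanity check on the reader settings above, the arithmetic below estimates how many synthetic instances each split can contain. The 0..7 count range is an assumption carried over from the data-creation script later in this commit; instances whose spans fail to fit are skipped, so these are upper bounds.

```python
# Upper-bound instance counts implied by the passage_attn2count reader config.
min_len, max_len, bucket_width = 100, 600, 100
num_buckets = (max_len - min_len) // bucket_width    # 5 buckets: (100,200) ... (500,600)
num_count_values = 8                                 # count values 0 through 7
print(num_buckets * num_count_values * 2000)         # train: at most 80000
print(num_buckets * num_count_values * 500)          # validation: at most 20000
```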
89 changes: 89 additions & 0 deletions datasets/drop/analysis/parabucketedsize.py
@@ -0,0 +1,89 @@
import os
import json
import argparse
import datasets.drop.constants as constants


def readDataset(input_json):
with open(input_json, 'r') as f:
dataset = json.load(f)
return dataset


def quesParaSize(input_json):
dataset = readDataset(input_json)
numparas = 0
maxparalen = 0
passage_len_sums = 0
plen_lt_100_cnt = 0
plen_lt_200_cnt = 0
plen_lt_400_cnt = 0
plen_lt_500_cnt = 0
plen_lt_600_cnt = 0
plen_lt_800_cnt = 0
plen_lt_1000_cnt = 0

for pid, pinfo in dataset.items():
numparas += 1
passage = pinfo[constants.tokenized_passage]
plen = len(passage.split(' '))
maxparalen = plen if plen > maxparalen else maxparalen

passage_len_sums += plen

if plen < 100:
plen_lt_100_cnt += 1
if plen < 200:
plen_lt_200_cnt += 1
if plen < 400:
plen_lt_400_cnt += 1
if plen < 500:
plen_lt_500_cnt += 1
if plen < 600:
plen_lt_600_cnt += 1
if plen < 800:
plen_lt_800_cnt += 1
if plen < 1000:
plen_lt_1000_cnt += 1

avg_plen = float(passage_len_sums)/numparas

print(f"Paras: {numparas} MaxParaLen:{maxparalen}")
print(f"Avg Para len: {avg_plen}")
print(f"Plen < 100: {plen_lt_100_cnt}")
print(f"Plen < 200: {plen_lt_200_cnt}")
print(f"Plen < 400: {plen_lt_400_cnt}")
print(f"Plen < 500: {plen_lt_500_cnt}")
print(f"Plen < 600: {plen_lt_600_cnt}")
print(f"Plen < 800: {plen_lt_800_cnt}")
print(f"Plen < 1000: {plen_lt_1000_cnt}")


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--inputdir')
args = parser.parse_args()

    # Fall back to a hardcoded dataset directory when --inputdir is not given
    inputdir = args.inputdir or "./resources/data/drop_s/num/count_filterqattn"
    # inputdir = "./resources/data/drop_s/date_num/date_numcq_hmvy_cnt_filter"

    train_json = 'drop_dataset_train.json'
    dev_json = 'drop_dataset_dev.json'


input_trnfp = os.path.join(inputdir, train_json)
input_devfp = os.path.join(inputdir, dev_json)

print(input_trnfp)
quesParaSize(input_trnfp)

print(input_devfp)
quesParaSize(input_devfp)



Empty file.
141 changes: 141 additions & 0 deletions datasets/drop/synthetic/pattn2count.py
@@ -0,0 +1,141 @@
from typing import List
import numpy as np
import random
from collections import defaultdict
import json

random.seed(100)
np.random.seed(100)


def sample_spansfor_variablelength(seqlen, num_spans, span_lengths: List[int]):
    """Sample num_spans non-overlapping spans (at least 1 token apart) with the given lengths."""
    sum_lengths = sum(span_lengths)
    # Tokens left over once every span is placed; we need at least one per span
    # to guarantee the gaps between consecutive spans.
    num_heads = seqlen - sum_lengths
    if num_heads < num_spans:
        return None
    # Shifted start positions: distinct sorted picks from this range map to
    # non-overlapping spans whose last span still ends within the passage.
    indices = range(seqlen - sum_lengths + 1)
    result = []
    offset = 0
    # Randomly sample n=num_spans heads
    for i, idx in enumerate(sorted(random.sample(indices, num_spans))):
        # Heads are 0-indexed; shift by the total length of the spans already placed
        idx += offset
        span_length = span_lengths[i]
        result.append((idx, idx + span_length))
        offset += span_length
    return result


def make_instance(min_passage_length: int, max_passage_length: int,
min_span_length: int, max_span_length: int, count_value: int):

passage_length = random.randint(min_passage_length, max_passage_length)
    # Background noise: absolute value of N(0, 0.1) at every token
attention = np.abs(np.random.normal(0.0, 0.1, passage_length))

if count_value > 0:
span_lengths = [random.randint(min_span_length, max_span_length) for _ in range(count_value)]
        # Sample count_value spans with the lengths drawn above; span ends are exclusive
sampled_spans = sample_spansfor_variablelength(passage_length, count_value, span_lengths)
if sampled_spans is None:
return None

for (start, end) in sampled_spans:
attention[start:end] += 1.0

attention_sum = sum(attention)
attention = attention / attention_sum

return attention

def _get_length_buckets(min_passage_length, max_passage_length):
if min_passage_length == max_passage_length:
return [(min_passage_length, max_passage_length)]

min_length_buckets = [min_passage_length]
max_length_buckets = []

    # Grow 100-token buckets, each ending 100 past the previous one, until max_passage_length is reached
i = 1
while True:
potential_max_len = i * 100 + min_passage_length
if potential_max_len <= max_passage_length:
max_length_buckets.append(potential_max_len)
min_length_buckets.append(max_length_buckets[-1]) # Last end is next's start

i += 1
else:
break
    if len(max_length_buckets) == 0 or max_length_buckets[-1] != max_passage_length:  # close the final (possibly shorter) bucket
max_length_buckets.append(max_passage_length)

if min_length_buckets[-1] == max_passage_length:
min_length_buckets = min_length_buckets[:-1]

return list(zip(min_length_buckets, max_length_buckets))


def make_data(min_passage_length, max_passage_length, min_span_length, max_span_length,
samples_per_bucket_count: int, max_count_value: int = 7):
    # For each 100-token length bucket and each count value, generate samples_per_bucket_count examples
num_instances_per_bucket_per_count = samples_per_bucket_count

# List of min and max passage
minmax_passagelen_tuples = _get_length_buckets(min_passage_length, max_passage_length)
data_dicts = []

lenbucket_count_dict = defaultdict()

for count_value in range(0, max_count_value + 1):
print(f"Count Value: {count_value}")
for min_plen, max_plen in minmax_passagelen_tuples:
instances_for_bucket = 0
for i in range(num_instances_per_bucket_per_count):
attention = make_instance(min_passage_length=min_plen, max_passage_length=max_plen,
min_span_length=min_span_length, max_span_length=max_span_length,
count_value=count_value)
if attention is None:
continue
if count_value not in lenbucket_count_dict:
lenbucket_count_dict[count_value] = defaultdict(int)
lenbucket_count_dict[count_value][(min_plen, max_plen)] += 1
attention = attention.tolist()
data_dicts.append({'attention': attention, 'count_value': count_value})
instances_for_bucket += 1
print(f"{min_plen}, {max_plen} :: {instances_for_bucket}")
print('\n')

print(lenbucket_count_dict)
return data_dicts


def write_data_to_file(data, filepath):
with open(filepath, 'w') as f:
json.dump(data, f)


if __name__ == '__main__':
train_data = make_data(min_passage_length=100, max_passage_length=600, min_span_length=5,
max_span_length=15, max_count_value=7, samples_per_bucket_count=2000)

dev_data = make_data(min_passage_length=100, max_passage_length=600, min_span_length=5,
max_span_length=15, max_count_value=7, samples_per_bucket_count=500)

train_data_path = "./resources/data/drop_s/synthetic/pattn2count/train.json"
dev_data_path = "./resources/data/drop_s/synthetic/pattn2count/dev.json"


write_data_to_file(train_data, train_data_path)
write_data_to_file(dev_data, dev_data_path)
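
A small usage sketch for the helpers above, assuming the repository root is on PYTHONPATH; the printed bucket list follows directly from `_get_length_buckets`.

```python
from datasets.drop.synthetic.pattn2count import _get_length_buckets, make_instance

print(_get_length_buckets(100, 600))
# [(100, 200), (200, 300), (300, 400), (400, 500), (500, 600)]

# One synthetic example: a normalized passage attention whose mass sits in
# count_value spans; make_data pairs it with count_value as the label.
attention = make_instance(min_passage_length=100, max_passage_length=200,
                          min_span_length=5, max_span_length=15, count_value=3)
print(attention.shape, round(float(attention.sum()), 3))   # (passage_length,) 1.0
```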
2 changes: 1 addition & 1 deletion scripts/allennlp/base/make_model_tar.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env

SERIALIZATION_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_modeled/SUPEPOCHS_2/S_100/SMFilter
SERIALIZATION_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_encoded/CNTFIX_true/SUPEPOCHS_5/S_100/PattnCount
WEIGHTS_TH=best.th

MODEL_ARCHIVE=${SERIALIZATION_DIR}/model.tar.gz
73 changes: 3 additions & 70 deletions scripts/allennlp/models/drop_parser/predict.sh
@@ -1,73 +1,6 @@
#!/usr/bin/env

export TMPDIR=/srv/local/data/nitishg/tmp
#
#### DATASET PATHS -- should be same across models for same dataset
#TRAINDATASET_NAME=date_num/dc_nc_100_yeardiff
#
#EVAL_DATASET=date/year_diff
#
#DATASET_DIR=./resources/data/drop_s/${EVAL_DATASET}
#TRAINFILE=${DATASET_DIR}/drop_dataset_train.json
#VALFILE=${DATASET_DIR}/drop_dataset_dev.json
#
## PACKAGE TO BE INCLUDED WHICH HOUSES ALL THE CODE
#INCLUDE_PACKAGE=semqa
#
## Check CONFIGFILE for environment variables to set
#export GPU=0
#
## All parameters here are used to fetch the correct serialization_dir
#export TOKENIDX="qanet"
#
#export BS=8
#export DROPOUT=0.2
#export LR=0.001
#
#export WEMB_DIM=100
#export RG=1e-4
#
## Which kind of similarity to use in Ques-Passage attention - raw / encoded / raw-enc
#export QP_SIM_KEY="raw"
#
#export GOLDACTIONS=false
#export GOLDPROGS=false
#export DENLOSS=true
#export EXCLOSS=true
#export QATTLOSS=true
#export MMLLOSS=true
#
## Whether strong supervison instances should be trained on first, if yes for how many epochs
#export SUPFIRST=true
#export SUPEPOCHS=5
#
#export SEED=100
#
#export BEAMSIZE=2
#
#export DEBUG=true
#
##### SERIALIZATION DIR --- Check for checkpoint_root/task/dataset/model/parameters/
#CHECKPOINT_ROOT=./resources/semqa/checkpoints
#SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop_old/${TRAINDATASET_NAME}
#MODEL_DIR=drop_parser
#PD_1=BS_${BS}/LR_${LR}/Drop_${DROPOUT}/TOKENS_${TOKENIDX}/ED_${WEMB_DIM}/RG_${RG}/GACT_${GOLDACTIONS}/GPROGS_${GOLDPROGS}
#PD_2=QPSIMKEY_${QP_SIM_KEY}/QAL_${DENLOSS}/EXL_${EXCLOSS}/QATL_${QATTLOSS}/MML_${MMLLOSS}/SUPFIRST_${SUPFIRST}/SUPEPOCHS_${SUPEPOCHS}
#SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PD_1}/${PD_2}/S_${SEED}/no_qsa
#
## PREDICTION DATASET
#PREDICT_OUTPUT_DIR=${SERIALIZATION_DIR}/predictions
#mkdir ${PREDICT_OUTPUT_DIR}
#
#mkdir -p ${PREDICT_OUTPUT_DIR}/${EVAL_DATASET}

##***************** PREDICTION FILENAME *****************
#PRED_FILENAME=${EVAL_DATASET}.dev_pred.txt
#EVAL_FILENAME=${EVAL_DATASET}.dev_eval.txt
#TESTFILE=${VALFILE}
##PRED_FILENAME=train_predictions.txt
##TESTFILE=${TRAINFILE}


# PACKAGE TO BE INCLUDED WHICH HOUSES ALL THE CODE
INCLUDE_PACKAGE=semqa
@@ -76,16 +9,16 @@ export BEAMSIZE=1
export DEBUG=true

# SAVED MODEL
MODEL_DIR=./resources/semqa/checkpoints/test/hmywcount_mod_sgfilter_filterlater5
MODEL_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_encoded/CNTFIX_true/SUPEPOCHS_5/S_100/PattnCount
MODEL_TAR=${MODEL_DIR}/model.tar.gz
PREDICTION_DIR=${MODEL_DIR}/predictions
mkdir ${PREDICTION_DIR}

# EVALUATION DATASET
SUBFOLDER=num
EVAL_DATASET=datecomp_full
EVAL_DATASET=

for EVAL_DATASET in hmyw_filter
for EVAL_DATASET in numcomp_full count_filterqattn hmyw_filter
do
DATASET_DIR=./resources/data/drop_s/${SUBFOLDER}/${EVAL_DATASET}
TRAINFILE=${DATASET_DIR}/drop_dataset_train.json