From 0acb0bcaffedcd229de1d4c132b08af431091720 Mon Sep 17 00:00:00 2001 From: Nitish Gupta Date: Thu, 9 May 2019 19:41:13 -0500 Subject: [PATCH] Works for DC+NC+YD+HMYW+Count. 1. New pattn2count pretraining, data creation. 2. Model loads pre-trained count. 3. Figuring out a new method to compute max and argmax. Will be a major change. This commit can be used to come back to. --- .../semqa/train/drop_parser_wmodel.jsonnet | 6 +- .../semqa/train/passage_attn2count.jsonnet | 29 +- datasets/drop/analysis/parabucketedsize.py | 89 ++++++ datasets/drop/synthetic/__init__.py | 0 datasets/drop/synthetic/pattn2count.py | 141 +++++++++ scripts/allennlp/base/make_model_tar.sh | 2 +- .../allennlp/models/drop_parser/predict.sh | 73 +---- scripts/allennlp/models/drop_parser/train.sh | 12 +- .../models/drop_pattn2count/predict.sh | 52 ++-- .../allennlp/models/drop_pattn2count/train.sh | 25 +- scripts/allennlp/models/naqanet/predict.sh | 51 ++- .../data/dataset_readers/drop/drop_reader.py | 15 +- .../drop/passage_attn2count.py | 291 ------------------ .../drop/pattn2count_reader.py | 273 ++++++++++++++++ semqa/data/iterators/filter_iterator.py | 12 - semqa/domain_languages/drop/drop_language.py | 99 +++--- .../drop/execution_parameters.py | 5 +- semqa/models/drop/drop_parser_wmodel.py | 16 +- semqa/models/drop/passage_attn_to_count.py | 102 ++++-- .../predictors/drop/pattn2count_predictor.py | 127 ++++++++ 20 files changed, 872 insertions(+), 548 deletions(-) create mode 100644 datasets/drop/analysis/parabucketedsize.py create mode 100644 datasets/drop/synthetic/__init__.py create mode 100644 datasets/drop/synthetic/pattn2count.py delete mode 100644 semqa/data/dataset_readers/drop/passage_attn2count.py create mode 100644 semqa/data/dataset_readers/drop/pattn2count_reader.py create mode 100644 semqa/predictors/drop/pattn2count_predictor.py diff --git a/allenconfigs/semqa/train/drop_parser_wmodel.jsonnet b/allenconfigs/semqa/train/drop_parser_wmodel.jsonnet index 09f72c8..48bf171 100644 --- a/allenconfigs/semqa/train/drop_parser_wmodel.jsonnet +++ b/allenconfigs/semqa/train/drop_parser_wmodel.jsonnet @@ -231,7 +231,7 @@ local compareff_inputdim = "type": "gru", "input_size": 4, "hidden_size": 20, - "num_layers": 3, + "num_layers": 2, "bidirectional": true, }, @@ -254,10 +254,10 @@ local compareff_inputdim = "initializers": [ - ["passage_attention_to_count|passage_count_predictor", + ["passage_attention_to_count|passage_count_hidden2logits", { "type": "pretrained", - "weights_file_path": "./resources/semqa/checkpoints/savedmodels/num2count_vstd/best.th" + "weights_file_path": "./resources/semqa/checkpoints/drop_pattn2count/T_gru/Isize_4/Hsize_20/Layers_2/S_100/t600_v600/best.th" }, ], [".*_text_field_embedder.*", "prevent"] diff --git a/allenconfigs/semqa/train/passage_attn2count.jsonnet b/allenconfigs/semqa/train/passage_attn2count.jsonnet index 2d3686e..5b5e7c0 100644 --- a/allenconfigs/semqa/train/passage_attn2count.jsonnet +++ b/allenconfigs/semqa/train/passage_attn2count.jsonnet @@ -1,15 +1,26 @@ -local utils = import 'utils.libsonnet'; - +local utils = import "utils.libsonnet"; { "dataset_reader": { "type": "passage_attn2count_reader", - "min_passage_length": 200, - "max_passage_length": 400, - "max_span_length": 10, - "num_training_samples": 2000, - "normalized": utils.boolparser(std.extVar("NORM")), - "withnoise": utils.boolparser(std.extVar("NOISE")), + "min_passage_length": 100, + "max_passage_length": 600, + "min_span_length": 5, + "max_span_length": 15, + "samples_per_bucket_count": 2000, + 
"normalized": true, + "withnoise": true, + }, + + "validation_dataset_reader": { + "type": "passage_attn2count_reader", + "min_passage_length": 100, + "max_passage_length": 600, + "min_span_length": 5, + "max_span_length": 15, + "samples_per_bucket_count": 500, + "normalized": true, + "withnoise": true, }, "train_data_path": std.extVar("TRAINING_DATA_FILE"), @@ -31,7 +42,7 @@ local utils = import 'utils.libsonnet'; "iterator": { "type": "basic", "batch_size": std.extVar("BS"), - "max_instances_in_memory": std.extVar("BS") + "max_instances_in_memory": 1000000, }, "trainer": { diff --git a/datasets/drop/analysis/parabucketedsize.py b/datasets/drop/analysis/parabucketedsize.py new file mode 100644 index 0000000..ced23a7 --- /dev/null +++ b/datasets/drop/analysis/parabucketedsize.py @@ -0,0 +1,89 @@ +import os +import json +import copy +import argparse +import datasets.drop.constants as constants +from collections import defaultdict +from utils.util import round_all + + +def readDataset(input_json): + with open(input_json, 'r') as f: + dataset = json.load(f) + return dataset + + +def quesParaSize(input_json): + dataset = readDataset(input_json) + numparas = 0 + maxparalen = 0 + passage_len_sums = 0 + plen_lt_100_cnt = 0 + plen_lt_200_cnt = 0 + plen_lt_400_cnt = 0 + plen_lt_500_cnt = 0 + plen_lt_600_cnt = 0 + plen_lt_800_cnt = 0 + plen_lt_1000_cnt = 0 + + for pid, pinfo in dataset.items(): + numparas += 1 + passage = pinfo[constants.tokenized_passage] + plen = len(passage.split(' ')) + maxparalen = plen if plen > maxparalen else maxparalen + + passage_len_sums += plen + + if plen < 100: + plen_lt_100_cnt += 1 + if plen < 200: + plen_lt_200_cnt += 1 + if plen < 400: + plen_lt_400_cnt += 1 + if plen < 500: + plen_lt_500_cnt += 1 + if plen < 600: + plen_lt_600_cnt += 1 + if plen < 800: + plen_lt_800_cnt += 1 + if plen < 1000: + plen_lt_1000_cnt += 1 + + avg_plen = float(passage_len_sums)/numparas + + print(f"Paras: {numparas} MaxParaLen:{maxparalen}") + print(f"Avg Para len: {avg_plen}") + print(f"Plen < 100: {plen_lt_100_cnt}") + print(f"Plen < 200: {plen_lt_200_cnt}") + print(f"Plen < 400: {plen_lt_400_cnt}") + print(f"Plen < 500: {plen_lt_500_cnt}") + print(f"Plen < 600: {plen_lt_600_cnt}") + print(f"Plen < 800: {plen_lt_800_cnt}") + print(f"Plen < 1000: {plen_lt_1000_cnt}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--inputdir') + args = parser.parse_args() + + inputdir = args.inputdir + + train_json = 'drop_dataset_train.json' + dev_json = 'drop_dataset_dev.json' + + inputdir = "./resources/data/drop_s/num/count_filterqattn" + # inputdir = "./resources/data/drop_s/date_num/date_numcq_hmvy_cnt_filter" + + + input_trnfp = os.path.join(inputdir, train_json) + input_devfp = os.path.join(inputdir, dev_json) + + print(input_trnfp) + quesParaSize(input_trnfp) + + print(input_devfp) + quesParaSize(input_devfp) + + + diff --git a/datasets/drop/synthetic/__init__.py b/datasets/drop/synthetic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datasets/drop/synthetic/pattn2count.py b/datasets/drop/synthetic/pattn2count.py new file mode 100644 index 0000000..a5f9a54 --- /dev/null +++ b/datasets/drop/synthetic/pattn2count.py @@ -0,0 +1,141 @@ +from typing import List +import numpy as np +import random +from collections import defaultdict +import json + +random.seed(100) +np.random.seed(100) + + +def sample_spansfor_variablelength(seqlen, num_spans, span_lengths: List[int]): + sum_lengths = sum(span_lengths) + # We need a gap of atleast 1 
token between two spans. Number of heads is computed based on longer spans (+1) + # and offset is also up by +1 + # Range of Number of possible span starts + num_heads = seqlen - (sum_lengths - num_spans + num_spans) + if num_heads < num_spans: + return None + indices = range(seqlen - (sum_lengths - num_spans)) + result = [] + offset = 0 + # Randomly sample n=num_spans heads + for i, idx in enumerate(sorted(random.sample(indices, num_spans))): + # These heads are 0-indexed, to this we add the offset we've covered in the seq + idx += offset + span_length = span_lengths[i] + result.append((idx, idx + span_length)) + offset += span_length - 1 + 1 + return result + + +def make_instance(min_passage_length: int, max_passage_length: int, + min_span_length: int, max_span_length: int, count_value: int): + + passage_length = random.randint(min_passage_length, max_passage_length) + # Mean: 0, Std: 0.2, Size: PassageLength + attention = np.abs(np.random.normal(0.0, 0.1, passage_length)) + + if count_value > 0: + span_lengths = [random.randint(min_span_length, max_span_length) for _ in range(count_value)] + # Sample n=count_value spans of the same length. Ends are exclusive + # sampled_spans = self.sample_spans(passage_length, count_value, span_length) + sampled_spans = sample_spansfor_variablelength(passage_length, count_value, span_lengths) + if sampled_spans is None: + return None + + for (start, end) in sampled_spans: + attention[start:end] += 1.0 + + attention_sum = sum(attention) + attention = attention / attention_sum + + return attention + +def _get_length_buckets(min_passage_length, max_passage_length): + if min_passage_length == max_passage_length: + return [(min_passage_length, max_passage_length)] + + min_length_buckets = [min_passage_length] + max_length_buckets = [] + + # Add start, end + 100 until end <= max_passage_length + i = 1 + while True: + potential_max_len = i * 100 + min_passage_length + if potential_max_len <= max_passage_length: + max_length_buckets.append(potential_max_len) + min_length_buckets.append(max_length_buckets[-1]) # Last end is next's start + + i += 1 + else: + break + if len(max_length_buckets) == 0 or max_length_buckets[-1] != max_passage_length: # This was left out + max_length_buckets.append(max_passage_length) + + if min_length_buckets[-1] == max_passage_length: + min_length_buckets = min_length_buckets[:-1] + + return list(zip(min_length_buckets, max_length_buckets)) + + +def make_data(min_passage_length, max_passage_length, min_span_length, max_span_length, + samples_per_bucket_count: int, max_count_value: int = 7): + # For each 100 length bucket, and count value, generate 1000 examples in train mode, and 100 in val mode + num_instances_per_bucket_per_count = samples_per_bucket_count + + # List of min and max passage + minmax_passagelen_tuples = _get_length_buckets(min_passage_length, max_passage_length) + data_dicts = [] + + lenbucket_count_dict = defaultdict() + + for count_value in range(0, max_count_value + 1): + print(f"Count Value: {count_value}") + for min_plen, max_plen in minmax_passagelen_tuples: + instances_for_bucket = 0 + for i in range(num_instances_per_bucket_per_count): + attention = make_instance(min_passage_length=min_plen, max_passage_length=max_plen, + min_span_length=min_span_length, max_span_length=max_span_length, + count_value=count_value) + if attention is None: + continue + if count_value not in lenbucket_count_dict: + lenbucket_count_dict[count_value] = defaultdict(int) + lenbucket_count_dict[count_value][(min_plen, max_plen)] += 1 
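+                # Bookkeeping only: tally usable instances per (count value, passage-length bucket);
+                # this tally (lenbucket_count_dict) is printed once the generation loops finish.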
+ attention = attention.tolist() + data_dicts.append({'attention': attention, 'count_value': count_value}) + instances_for_bucket += 1 + print(f"{min_plen}, {max_plen} :: {instances_for_bucket}") + print('\n') + + print(lenbucket_count_dict) + return data_dicts + + +def write_data_to_file(data, filepath): + with open(filepath, 'w') as f: + json.dump(data, f) + + +if __name__=='__main__': + train_data = make_data(min_passage_length=100, max_passage_length=600, min_span_length=5, + max_span_length=15, max_count_value=7, samples_per_bucket_count=2000) + + dev_data = make_data(min_passage_length=100, max_passage_length=600, min_span_length=5, + max_span_length=15, max_count_value=7, samples_per_bucket_count=500) + + train_data_path = "./resources/data/drop_s/synthetic/pattn2count/train.json" + dev_data_path = "./resources/data/drop_s/synthetic/pattn2count/dev.json" + + + write_data_to_file(train_data, train_data_path) + write_data_to_file(dev_data, dev_data_path) + + + + + + + + diff --git a/scripts/allennlp/base/make_model_tar.sh b/scripts/allennlp/base/make_model_tar.sh index a735b48..fd37ec7 100755 --- a/scripts/allennlp/base/make_model_tar.sh +++ b/scripts/allennlp/base/make_model_tar.sh @@ -1,6 +1,6 @@ #!/usr/bin/env -SERIALIZATION_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_modeled/SUPEPOCHS_2/S_100/SMFilter +SERIALIZATION_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_encoded/CNTFIX_true/SUPEPOCHS_5/S_100/PattnCount WEIGHTS_TH=best.th MODEL_ARCHIVE=${SERIALIZATION_DIR}/model.tar.gz diff --git a/scripts/allennlp/models/drop_parser/predict.sh b/scripts/allennlp/models/drop_parser/predict.sh index 0dc9de5..d4fcbee 100644 --- a/scripts/allennlp/models/drop_parser/predict.sh +++ b/scripts/allennlp/models/drop_parser/predict.sh @@ -1,73 +1,6 @@ #!/usr/bin/env export TMPDIR=/srv/local/data/nitishg/tmp -# -#### DATASET PATHS -- should be same across models for same dataset -#TRAINDATASET_NAME=date_num/dc_nc_100_yeardiff -# -#EVAL_DATASET=date/year_diff -# -#DATASET_DIR=./resources/data/drop_s/${EVAL_DATASET} -#TRAINFILE=${DATASET_DIR}/drop_dataset_train.json -#VALFILE=${DATASET_DIR}/drop_dataset_dev.json -# -## PACKAGE TO BE INCLUDED WHICH HOUSES ALL THE CODE -#INCLUDE_PACKAGE=semqa -# -## Check CONFIGFILE for environment variables to set -#export GPU=0 -# -## All parameters here are used to fetch the correct serialization_dir -#export TOKENIDX="qanet" -# -#export BS=8 -#export DROPOUT=0.2 -#export LR=0.001 -# -#export WEMB_DIM=100 -#export RG=1e-4 -# -## Which kind of similarity to use in Ques-Passage attention - raw / encoded / raw-enc -#export QP_SIM_KEY="raw" -# -#export GOLDACTIONS=false -#export GOLDPROGS=false -#export DENLOSS=true -#export EXCLOSS=true -#export QATTLOSS=true -#export MMLLOSS=true -# -## Whether strong supervison instances should be trained on first, if yes for how many epochs -#export SUPFIRST=true -#export SUPEPOCHS=5 -# -#export SEED=100 -# -#export BEAMSIZE=2 -# -#export DEBUG=true -# -##### SERIALIZATION DIR --- Check for checkpoint_root/task/dataset/model/parameters/ -#CHECKPOINT_ROOT=./resources/semqa/checkpoints -#SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop_old/${TRAINDATASET_NAME} -#MODEL_DIR=drop_parser -#PD_1=BS_${BS}/LR_${LR}/Drop_${DROPOUT}/TOKENS_${TOKENIDX}/ED_${WEMB_DIM}/RG_${RG}/GACT_${GOLDACTIONS}/GPROGS_${GOLDPROGS} 
-#PD_2=QPSIMKEY_${QP_SIM_KEY}/QAL_${DENLOSS}/EXL_${EXCLOSS}/QATL_${QATTLOSS}/MML_${MMLLOSS}/SUPFIRST_${SUPFIRST}/SUPEPOCHS_${SUPEPOCHS} -#SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PD_1}/${PD_2}/S_${SEED}/no_qsa -# -## PREDICTION DATASET -#PREDICT_OUTPUT_DIR=${SERIALIZATION_DIR}/predictions -#mkdir ${PREDICT_OUTPUT_DIR} -# -#mkdir -p ${PREDICT_OUTPUT_DIR}/${EVAL_DATASET} - -##***************** PREDICTION FILENAME ***************** -#PRED_FILENAME=${EVAL_DATASET}.dev_pred.txt -#EVAL_FILENAME=${EVAL_DATASET}.dev_eval.txt -#TESTFILE=${VALFILE} -##PRED_FILENAME=train_predictions.txt -##TESTFILE=${TRAINFILE} - # PACKAGE TO BE INCLUDED WHICH HOUSES ALL THE CODE INCLUDE_PACKAGE=semqa @@ -76,16 +9,16 @@ export BEAMSIZE=1 export DEBUG=true # SAVED MODEL -MODEL_DIR=./resources/semqa/checkpoints/test/hmywcount_mod_sgfilter_filterlater5 +MODEL_DIR=./resources/semqa/checkpoints/drop/date_num/date_numcq_hmvy_cnt_filter/drop_parser/TOKENS_qanet/ED_100/RG_1e-07/MODELTYPE_encoded/CNTFIX_true/SUPEPOCHS_5/S_100/PattnCount MODEL_TAR=${MODEL_DIR}/model.tar.gz PREDICTION_DIR=${MODEL_DIR}/predictions mkdir ${PREDICTION_DIR} # EVALUATION DATASET SUBFOLDER=num -EVAL_DATASET=datecomp_full +EVAL_DATASET= -for EVAL_DATASET in hmyw_filter +for EVAL_DATASET in numcomp_full count_filterqattn hmyw_filter do DATASET_DIR=./resources/data/drop_s/${SUBFOLDER}/${EVAL_DATASET} TRAINFILE=${DATASET_DIR}/drop_dataset_train.json diff --git a/scripts/allennlp/models/drop_parser/train.sh b/scripts/allennlp/models/drop_parser/train.sh index 8469002..154f5e0 100644 --- a/scripts/allennlp/models/drop_parser/train.sh +++ b/scripts/allennlp/models/drop_parser/train.sh @@ -3,8 +3,8 @@ export TMPDIR=/srv/local/data/nitishg/tmp ### DATASET PATHS -- should be same across models for same dataset -# DATASET_NAME=date_num/datepaq_numcq_hmvy_ydiff_countqa -DATASET_NAME=num/hmyw_count_filter +DATASET_NAME=date_num/date_numcq_hmvy_cnt_filter +# DATASET_NAME=num/hmyw_count_filter DATASET_DIR=./resources/data/drop_s/${DATASET_NAME} TRAINFILE=${DATASET_DIR}/drop_dataset_train.json @@ -34,7 +34,7 @@ export WORDEMB_FILE="https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/ export BIDAF_MODEL_TAR='https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz' export BIDAF_WORDEMB_FILE="https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz" -export MODELTYPE=modeled +export MODELTYPE=encoded export COUNT_FIXED=true export DENLOSS=true @@ -67,11 +67,11 @@ export DEBUG=false CHECKPOINT_ROOT=./resources/semqa/checkpoints SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop/${DATASET_NAME} MODEL_DIR=drop_parser -PD_1=TOKENS_${TOKENIDX}/ED_${WEMB_DIM}/RG_${RG}/MODELTYPE_${MODELTYPE} +PD_1=TOKENS_${TOKENIDX}/ED_${WEMB_DIM}/RG_${RG}/MODELTYPE_${MODELTYPE}/CNTFIX_${COUNT_FIXED} PD_2=SUPEPOCHS_${SUPEPOCHS} -SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PD_1}/${PD_2}/S_${SEED}/SigmFilter +SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PD_1}/${PD_2}/S_${SEED}/PattnCount -SERIALIZATION_DIR=./resources/semqa/checkpoints/test/hmywcount_mod_sgfilter_filterlater5_cntfix +# SERIALIZATION_DIR=./resources/semqa/checkpoints/test/hmywcount_mod_sgfilter_filterlater5_cntfix # SERIALIZATION_DIR=./resources/semqa/checkpoints/test/test ####################################################################################################################### diff --git a/scripts/allennlp/models/drop_pattn2count/predict.sh b/scripts/allennlp/models/drop_pattn2count/predict.sh index 
3cd84db..27947b6 100644 --- a/scripts/allennlp/models/drop_pattn2count/predict.sh +++ b/scripts/allennlp/models/drop_pattn2count/predict.sh @@ -2,11 +2,6 @@ export TMPDIR=/srv/local/data/nitishg/tmp -### DATASET PATHS -- should be same across models for same dataset -DATASET_DIR=./resources/data/drop/date_subset -TRAINFILE=${DATASET_DIR}/drop_dataset_train.json -VALFILE=${DATASET_DIR}/drop_dataset_dev.json - # PACKAGE TO BE INCLUDED WHICH HOUSES ALL THE CODE INCLUDE_PACKAGE=semqa @@ -14,48 +9,41 @@ INCLUDE_PACKAGE=semqa export GPU=0 # All parameters here are used to fetch the correct serialization_dir -export TOKENIDX="qanet" - -export GOLDACTIONS=true - export BS=8 export DROPOUT=0.2 -export DEBUG=false - -#### SERIALIZATION DIR --- Check for checkpoint_root/task/dataset/model/parameters/ -CHECKPOINT_ROOT=./resources/semqa/checkpoints -SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop/date_num -MODEL_DIR=drop_parser -PARAMETERS_DIR1=BS_${BS}/Drop_${DROPOUT}/TOKENS_${TOKENIDX}/GOLDAC_${GOLDACTIONS} -# PARAMETERS_DIR2=AUXGPLOSS_${AUXGPLOSS}/ATTCOV_${ATTCOVLOSS}/PTREX_${PTREX} -SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PARAMETERS_DIR1}_ep_rawsim_attperf #/${PARAMETERS_DIR2} +DATASET_DIR=./resources/data/drop_s/synthetic/pattn2count +TRAINFILE=${DATASET_DIR}/train.json +VALFILE=${DATASET_DIR}/dev.json +TEST_DATA=${VALFILE} -# PREDICTION DATASET -PREDICT_OUTPUT_DIR=${SERIALIZATION_DIR}/predictions -mkdir ${PREDICT_OUTPUT_DIR} +#### SERIALIZATION DIR --- Check for checkpoint_root/task/dataset/model/parameters/ +MODEL_DIR=./resources/semqa/checkpoints/drop_pattn2count/T_gru/Isize_4/Hsize_20/Layers_2/S_100/t600_v600 +MODEL_TAR=${MODEL_DIR}/model.tar.gz +PREDICTION_DIR=${MODEL_DIR}/predictions +mkdir ${PREDICTION_DIR} +PREDICTION_FILE=${PREDICTION_DIR}/preds.txt +EVALUATION_FILE=${PREDICTION_DIR}/eval.txt -#***************** PREDICTION FILENAME ***************** -PRED_FILENAME=dev_predictions.txt +####################################################################################################################### -TESTFILE=${VALFILE} -MODEL_TAR=${SERIALIZATION_DIR}/model.tar.gz -PREDICTION_FILE=${PREDICT_OUTPUT_DIR}/${PRED_FILENAME} -PREDICTOR=drop_parser_predictor +allennlp evaluate --output-file ${EVALUATION_FILE} \ + --cuda-device ${GPU} \ + --include-package ${INCLUDE_PACKAGE} \ + ${MODEL_TAR} ${TEST_DATA} -####################################################################################################################### allennlp predict --output-file ${PREDICTION_FILE} \ - --predictor ${PREDICTOR} \ + --predictor pattn2count_predictor \ --cuda-device ${GPU} \ --include-package ${INCLUDE_PACKAGE} \ --silent \ --batch-size 1 \ --use-dataset-reader \ - --overrides "{ "model": {"debug": ${DEBUG}} }" \ - ${MODEL_TAR} ${TESTFILE} - + --overrides "{"dataset_reader": { "samples_per_bucket_count": 5}}" \ + ${MODEL_TAR} ${TEST_DATA} +# echo -e "Predictions file saved at: ${PREDICTION_FILE}" diff --git a/scripts/allennlp/models/drop_pattn2count/train.sh b/scripts/allennlp/models/drop_pattn2count/train.sh index a3e637e..66301b6 100644 --- a/scripts/allennlp/models/drop_pattn2count/train.sh +++ b/scripts/allennlp/models/drop_pattn2count/train.sh @@ -2,11 +2,11 @@ export TMPDIR=/srv/local/data/nitishg/tmp -DATASET_NAME=num/yardscount_wqattn +DATASET_NAME=num/numcomp_full -DATASET_DIR=./resources/data/drop_s/${DATASET_NAME} -TRAINFILE=${DATASET_DIR}/drop_dataset_train.json -VALFILE=${DATASET_DIR}/drop_dataset_dev.json +DATASET_DIR=./resources/data/drop_s/synthetic/pattn2count 
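+# Synthetic passage-attention -> count data written by datasets/drop/synthetic/pattn2count.py.
+# Generate it before training, e.g. (assuming the `datasets` package is importable from the repo root):
+#   python -m datasets.drop.synthetic.pattn2count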
+TRAINFILE=${DATASET_DIR}/train.json +VALFILE=${DATASET_DIR}/dev.json export TRAINING_DATA_FILE=${TRAINFILE} export VAL_DATA_FILE=${VALFILE} @@ -20,18 +20,19 @@ CONFIGFILE=allenconfigs/semqa/train/passage_attn2count.jsonnet # Check CONFIGFILE for environment variables to set export GPU=0 -export BS=8 +export BS=16 export TYPE=gru export ISIZE=4 export HSIZE=20 -export NL=3 -export SEED=10 +export NL=2 + +export SEED=100 export NORM=true export NOISE=true -export EPOCHS=20 +export EPOCHS=40 for SEED in 100 do @@ -39,12 +40,12 @@ do do #### SERIALIZATION DIR --- Check for checkpoint_root/task/dataset/model/parameters/ CHECKPOINT_ROOT=./resources/semqa/checkpoints - SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop/date_num + SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT} MODEL_DIR=drop_pattn2count - PARAMETERS_DIR1=BS_${BS}/NORM_${NORM}/T_${TYPE}/I_${ISIZE}/H_${HSIZE}/NL_${NL}/NOISE_${NOISE}/S_${SEED} - SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PARAMETERS_DIR1} + PARAMETERS_DIR1=T_${TYPE}/Isize_${ISIZE}/Hsize_${HSIZE}/Layers_${NL}/S_${SEED} + SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR}/${PARAMETERS_DIR1}/t600_v600_noLenBias - SERIALIZATION_DIR=./resources/semqa/checkpoints/savedmodels/count_pretrn_nobias + # SERIALIZATION_DIR=./resources/semqa/checkpoints/savedmodels/test ####################################################################################################################### diff --git a/scripts/allennlp/models/naqanet/predict.sh b/scripts/allennlp/models/naqanet/predict.sh index fc067f6..fc6af95 100644 --- a/scripts/allennlp/models/naqanet/predict.sh +++ b/scripts/allennlp/models/naqanet/predict.sh @@ -1,44 +1,43 @@ #!/usr/bin/env bash -### DATASET PATHS -- should be same across models for same dataset - -DATASET_NAME=num/howmanyyards_count_diff -DATASET_DIR=./resources/data/drop_s/${DATASET_NAME} - -TRAINFILE=${DATASET_DIR}/drop_dataset_train.json -VALFILE=${DATASET_DIR}/drop_dataset_dev.json - MODEL_TAR='https://s3-us-west-2.amazonaws.com/allennlp/models/naqanet-2019.03.01.tar.gz' GPU=0 CHECKPOINT_ROOT=./resources/semqa/checkpoints -SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop/${DATASET_NAME} +SERIALIZATION_DIR_ROOT=${CHECKPOINT_ROOT}/drop MODEL_DIR=naqanet SERIALIZATION_DIR=${SERIALIZATION_DIR_ROOT}/${MODEL_DIR} -OUTPUT_DIR=${SERIALIZATION_DIR}/predictions +PREDICTION_DIR=${SERIALIZATION_DIR}/predictions -mkdir -p ${OUTPUT_DIR} +mkdir -p ${PREDICTION_DIR} +SUBFOLDER=date_num -TESTFILE=${VALFILE} -EVAL_FILE=${OUTPUT_DIR}/dev_eval.txt -PREDICTION_FILE=${OUTPUT_DIR}/dev_predictions.txt +for EVAL_DATASET in date_numcq_hmvy_cnt_filter +do + DATASET_DIR=./resources/data/drop_s/${SUBFOLDER}/${EVAL_DATASET} + TRAINFILE=${DATASET_DIR}/drop_dataset_train.json + VALFILE=${DATASET_DIR}/drop_dataset_dev.json -allennlp evaluate --output-file ${EVAL_FILE} \ - --cuda-device ${GPU} \ - ${MODEL_TAR} ${TESTFILE} + TESTFILE=${VALFILE} + PREDICTION_FILE=${PREDICTION_DIR}/${EVAL_DATASET}_dev_pred.txt + EVALUATION_FILE=${PREDICTION_DIR}/${EVAL_DATASET}_dev_eval.txt -allennlp predict --output-file ${PREDICTION_FILE} \ - --predictor 'machine-comprehension' \ - --cuda-device ${GPU} \ - --silent \ - --batch-size 1 \ - --use-dataset-reader \ - ${MODEL_TAR} ${TESTFILE} + allennlp evaluate --output-file ${EVALUATION_FILE} \ + --cuda-device ${GPU} \ + ${MODEL_TAR} ${TESTFILE} -echo -e "Evaluation file saved at: ${EVAL_FILE}" -echo -e "Predictions file saved at: ${PREDICTION_FILE}" +# allennlp predict --output-file ${PREDICTION_FILE} \ +# --predictor 'machine-comprehension' \ +# 
--cuda-device ${GPU} \ +# --silent \ +# --batch-size 1 \ +# --use-dataset-reader \ +# ${MODEL_TAR} ${TESTFILE} + echo -e "Evaluation file saved at: ${EVALUATION_FILE}" + echo -e "Predictions file saved at: ${PREDICTION_FILE}" +done diff --git a/semqa/data/dataset_readers/drop/drop_reader.py b/semqa/data/dataset_readers/drop/drop_reader.py index 842d174..6913532 100644 --- a/semqa/data/dataset_readers/drop/drop_reader.py +++ b/semqa/data/dataset_readers/drop/drop_reader.py @@ -1051,16 +1051,19 @@ def maxnum_filterfind_logicalforms(**kwargs) -> Tuple[List[str], List[str]]: @staticmethod def count_find_logicalforms(**kwargs) -> Tuple[List[str], List[str]]: - find_num_lfs, _ = DROPReader.findnum_logicalforms() - find_num_lf = find_num_lfs[0] - gold_lf = f"(numberDistribution2Count {find_num_lf})" + # find_num_lfs, _ = DROPReader.findnum_logicalforms() + # find_num_lf = find_num_lfs[0] + # gold_lf = f"(numberDistribution2Count {find_num_lf})" + gold_lf = "(passageAttn2Count find_PassageAttention)" return [gold_lf], ['count_number'] @staticmethod def count_filterfind_logicalforms(**kwargs) -> Tuple[List[str], List[str]]: - findfilter_num_lfs, _ = DROPReader.filterfindnum_logicalforms() - findfilter_num_lf = findfilter_num_lfs[0] - gold_lf = f"(numberDistribution2Count {findfilter_num_lf})" + # findfilter_num_lfs, _ = DROPReader.filterfindnum_logicalforms() + # findfilter_num_lf = findfilter_num_lfs[0] + # gold_lf = f"(numberDistribution2Count {findfilter_num_lf})" + filter_passageattn_lf = DROPReader.filter_passageattn_lf() + gold_lf = f"(passageAttn2Count {filter_passageattn_lf})" return [gold_lf], ['count_number'] diff --git a/semqa/data/dataset_readers/drop/passage_attn2count.py b/semqa/data/dataset_readers/drop/passage_attn2count.py deleted file mode 100644 index ab7fad9..0000000 --- a/semqa/data/dataset_readers/drop/passage_attn2count.py +++ /dev/null @@ -1,291 +0,0 @@ -import json -import random -import logging -import itertools -import numpy as np -from typing import Dict, List, Union, Tuple, Any -from collections import defaultdict -from overrides import overrides -from allennlp.common.file_utils import cached_path -from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.instance import Instance -from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance -from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer -from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer -from allennlp.data.dataset_readers.reading_comprehension.util import IGNORED_TOKENS, STRIPPED_CHARACTERS -from allennlp.data.fields import Field, TextField, MetadataField, LabelField, ListField, \ - SequenceLabelField, SpanField, IndexField, ProductionRuleField, ArrayField - -from semqa.domain_languages.drop_old.drop_language import DropLanguage, Date, get_empty_language_object -from collections import defaultdict - -from datasets.drop import constants - -# from reading_comprehension.utils import split_tokens_by_hyphen - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - - -# TODO: Add more number here -WORD_NUMBER_MAP = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, - "five": 5, "six": 6, "seven": 7, "eight": 8, - "nine": 9, "ten": 10, "eleven": 11, "twelve": 12, - "thirteen": 13, "fourteen": 14, "fifteen": 15, - "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19} - - -@DatasetReader.register("passage_attn2count_reader") -class DROPReader(DatasetReader): - def 
__init__(self, - lazy: bool = False, - min_passage_length=200, - max_passage_length=400, - max_span_length=10, - num_training_samples=2000, - normalized=True, - withnoise=True)-> None: - super().__init__(lazy) - - self._min_passage_length = min_passage_length - self._max_passage_length = max_passage_length - self._max_span_length = max_span_length - self._num_training_samples = num_training_samples - self._normalized = normalized - self._withnoise = withnoise - - @overrides - def _read(self, file_path: str): - # pylint: disable=logging-fstring-interpolation - logger.info(f"Making {self._num_training_samples} training examples with:\n" - f"max_passage_length: {self._max_passage_length}\n" - f"min_passage_len: {self._min_passage_length}\n" - f"max_span_len:{self._max_span_length}\n") - - instances: List[Instance] = [] - for i in range(self._num_training_samples): - fields: Dict[str, Field] = {} - - passage_length = random.randint(self._min_passage_length, self._max_passage_length) - attention = [0.0 for _ in range(passage_length)] - - span_length = random.randint(1, self._max_span_length) - - # Inclusive start and end positions - start_position = random.randint(0, passage_length - span_length) - end_position = start_position + span_length - 1 - - attention[start_position:end_position + 1] = [1.0] * span_length - - if self._withnoise: - attention = [x + abs(random.gauss(0, 0.001)) for x in attention] - - if self._normalized: - attention_sum = sum(attention) - attention = [float(x)/attention_sum for x in attention] - - passage_span_fields = ArrayField(np.array([[start_position, end_position]]), padding_value=-1) - - fields["passage_attention"] = ArrayField(np.array(attention), padding_value=0.0) - - fields["passage_lengths"] = MetadataField(passage_length) - - fields["answer_as_passage_spans"] = passage_span_fields - - instances.append(Instance(fields)) - - print("Making data") - - return instances - - - def number2count_auxloss(self, passage_number_values: List[List[float]], device_id=-1): - """ Using passage numnbers, make a (batch_size, max_passage_numbers) (padded) tensor, each containing a - noisy distribution with mass distributed over x-numbers. The corresponding count-answer will be x. - Use the attention2count rnn to predict a count value and compute the loss. - """ - batch_size = len(passage_number_values) - # List of length -- batch-size - num_of_passage_numbers = [len(nums) for nums in passage_number_values] - max_passage_numbers = max(num_of_passage_numbers) - - # Shape: (batch_size, ) - num_pasasge_numbers = allenutil.move_to_device(torch.LongTensor(num_of_passage_numbers), cuda_device=device_id) - # Shape: (max_passage_numbers, ) - range_vector = allenutil.get_range_vector(size=max_passage_numbers, device=device_id) - - # Shape: (batch_size, maxnum_passage_numbers) - mask = (range_vector.unsqueeze(0) < num_pasasge_numbers.unsqueeze(1)).float() - - number_distributions = mask.new_zeros(batch_size, max_passage_numbers).normal_(0, 0.01).abs_() - count_answers = number_distributions.new_zeros(batch_size).long() - for i, num_numbers in enumerate(num_of_passage_numbers): - """ Sample a count value between [0, min(5, num_numbers)]. Sample indices in this range, and set them as 1. - Add gaussian noise to the whole tensor and normalize. 
- """ - # Pick a count answer - count_value = random.randint(0, min(7, num_numbers)) - count_answers[i] = count_value - # Pick the indices that will have mass - if count_value > 0: - indices = random.sample(range(num_numbers), count_value) - # Add 1.0 to all sampled indices - number_distributions[i, indices] += 1.0 - - number_distributions = number_distributions * mask - # Shape: (batch_size, maxnum_passage_numbers) - number_distributions = number_distributions / torch.sum(number_distributions, dim=1).unsqueeze(1) - - # Distributions made; computing loss - scaled_attentions = [number_distributions * sf for sf in - self._executor_parameters.passage_attention_scalingvals] - # Shape: (batch_size, maxnum_passage_numbers, num_scaling_factors) - stacked_scaled_attentions = torch.stack(scaled_attentions, dim=2) - - # Shape: (batch_size, hidden_dim) - count_hidden_repr = self.passage_attention_to_count(stacked_scaled_attentions, mask) - - # Shape: (batch_size, num_counts) - count_logits = self.passage_count_predictor(count_hidden_repr) - - count_loss = F.cross_entropy(input=count_logits, target=count_answers) - - return count_loss - - - """ - def _read(self, file_path: str): - # pylint: disable=logging-fstring-interpolation - - instances: List[Instance] = [] - with open(file_path) as dataset_file: - dataset = json.load(dataset_file) - logger.info(f"Reading the dataset from: {file_path}") - - count_dist = defaultdict(int) - masked = 0 - - for passage_id, passage_info in dataset.items(): - passage_text = passage_info[constants.tokenized_passage] - passage_length = len(passage_text.split(' ')) - passage_tokens = passage_text.split(' ') - # print(passage_text) - # print() - - for question_answer in passage_info[constants.qa_pairs]: - fields = {} - - # TODO(nitish): Only using first span as answer - - start_position = 5 - end_position = 10 - - - attention, count_answer, mask = self.make_count_instance(passage_tokens) - if mask != 0: - count_dist[count_answer] += 1 - else: - masked += 1 - - - if self._withnoise: - attention = [x + abs(random.gauss(0, 0.001)) for x in attention] - - if self._normalized: - attention_sum = sum(attention) - attention = [float(x) / attention_sum for x in attention] - - count_answer_vec = [0] * 10 - count_answer_vec[count_answer] = 1 - - fields["passage_attention"] = ArrayField(np.array(attention), padding_value=0.0) - - fields["passage_lengths"] = MetadataField(passage_length) - - fields["answer_as_count"] = ArrayField(np.array(count_answer_vec)) - - fields["count_mask"] = ArrayField(np.array(mask)) - - instances.append(Instance(fields)) - - print(count_dist) - - return instances - - - - def make_count_instance(self, passage_tokens: List[str]): - ''' output an attention, count_answer, mask. 
Mask is when we don;t find relevant spans ''' - - # We would like to count these spans - relevant_spans = ['TD pass', 'TD run', 'touchdown pass', 'field goal', 'touchdown run'] - num_relevant_spans = len(relevant_spans) - - attention = [0.0] * len(passage_tokens) - - # With 10% prob select no span - count_zero_prob = random.random() - if count_zero_prob < 0.1: - return (attention, 0, 1) - - - # Choose a particular type of span from relevant ones and find it's starting positions - tries = 0 - starting_positions_in_passage = [] - while len(starting_positions_in_passage) == 0 and tries < 5: - choosen_span = random.randint(0, num_relevant_spans - 1) - span_tokens = relevant_spans[choosen_span].split(' ') - starting_positions_in_passage = self.contains(span_tokens, passage_tokens) - tries += 1 - - # even after 5 tries, span to count not found. Return masked attention - if len(starting_positions_in_passage) == 0: - return attention, 0, 0 - - # # TO save from infinite loop - # count_zero_prob = random.random() - # if count_zero_prob < 0.1: - # return attention, 0 - - if len(starting_positions_in_passage) == 1: - count = len(starting_positions_in_passage) - starting_position = starting_positions_in_passage[0] - attention[starting_position] = 1.0 - attention[starting_position + 1] = 1.0 - - else: - num_of_spans_found = len(starting_positions_in_passage) - # Choose a subset of the starting_positions - random.shuffle(starting_positions_in_passage) - num_spans = random.randint(2, num_of_spans_found) - num_spans = min(num_spans, 9) - - count = num_spans - - spread_len = random.randint(1, 3) - - chosen_starting_positions = starting_positions_in_passage[0:num_spans] - for starting_position in chosen_starting_positions: - attention[starting_position] = 1.0 - attention[starting_position + 1] = 1.0 - for i in range(1, spread_len+1): - prev_idx = starting_position - i - if prev_idx >= 0: - attention[prev_idx] = 0.5 - next_idx = starting_position + 1 + i - if next_idx < len(passage_tokens): - attention[next_idx] = 0.5 - - return attention, count, 1 - - def contains(self, small, big): - starting_positions = [] - for i in range(len(big) - len(small) + 1): - start = True - for j in range(len(small)): - if big[i + j] != small[j]: - start = False - break - if start: - starting_positions.append(i) - return starting_positions - """ diff --git a/semqa/data/dataset_readers/drop/pattn2count_reader.py b/semqa/data/dataset_readers/drop/pattn2count_reader.py new file mode 100644 index 0000000..cd65aa0 --- /dev/null +++ b/semqa/data/dataset_readers/drop/pattn2count_reader.py @@ -0,0 +1,273 @@ +import json +import random +import logging +import itertools +import numpy as np +from typing import Dict, List, Union, Tuple, Any +from collections import defaultdict +from overrides import overrides +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.instance import Instance +from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance +from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer +from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer +from allennlp.data.dataset_readers.reading_comprehension.util import IGNORED_TOKENS, STRIPPED_CHARACTERS +from allennlp.data.fields import Field, TextField, MetadataField, LabelField, ListField, \ + SequenceLabelField, SpanField, IndexField, ProductionRuleField, ArrayField + +from 
semqa.domain_languages.drop_old.drop_language import DropLanguage, Date, get_empty_language_object +from collections import defaultdict + +from datasets.drop import constants + +# from reading_comprehension.utils import split_tokens_by_hyphen + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +# TODO: Add more number here +WORD_NUMBER_MAP = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, + "five": 5, "six": 6, "seven": 7, "eight": 8, + "nine": 9, "ten": 10, "eleven": 11, "twelve": 12, + "thirteen": 13, "fourteen": 14, "fifteen": 15, + "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19} + + +@DatasetReader.register("passage_attn2count_reader") +class PAttn2CountReader(DatasetReader): + def __init__(self, + lazy: bool = False, + min_passage_length=200, + max_passage_length=400, + min_span_length=5, + max_span_length=15, + samples_per_bucket_count=1000, + normalized=True, + withnoise=True)-> None: + super().__init__(lazy) + + self._min_passage_length = min_passage_length + self._max_passage_length = max_passage_length + self._min_span_length = min_span_length + self._max_span_length = max_span_length + self._normalized = True # normalized + self._withnoise = True # withnoise + self.samples_per_bucket_count = samples_per_bucket_count + self.num_instances = 0 + + # def _read(self, file_path: str): + # # pylint: disable=logging-fstring-interpolation + # logger.info(f"Reading file from: {file_path}\n") + # + # instances: List[Instance] = [] + # + # with open(file_path, 'r') as f: + # data_dicts = json.load(f) + # + # for d in data_dicts: + # attention = d['attention'] + # count_value = d['count_value'] + # passage_length = len(attention) + # + # fields = {} + # fields["passage_attention"] = ArrayField(np.array(attention), padding_value=-1) + # fields["passage_lengths"] = MetadataField(passage_length) + # fields["count_answer"] = LabelField(count_value, skip_indexing=True) + # + # instances.append(Instance(fields)) + # self.num_instances += 1 + # # random.shuffle(instances) + # print(f"TotalInstances: {self.num_instances}") + # print(f"Data read!") + # + # return instances + + @overrides + def _read(self, file_path: str): + # pylint: disable=logging-fstring-interpolation + logger.info(f"Making training examples with:\n" + f"min_passage_len: {self._min_passage_length}\n" + f"max_passage_length: {self._max_passage_length}\n" + f"min / max span_len: {self._min_span_length} / {self._max_span_length}\n") + + instances: List[Instance] = [] + + data_dicts = self.make_data(min_passage_length=self._min_passage_length, + max_passage_length=self._max_passage_length, + min_span_length=self._min_span_length, max_span_length=self._max_span_length, + max_count_value=7, samples_per_bucket_count=self.samples_per_bucket_count) + + for d in data_dicts: + attention = d['attention'] + count_value = d['count_value'] + passage_length = len(attention) + + fields = {} + fields["passage_attention"] = ArrayField(np.array(attention), padding_value=-1) + fields["passage_lengths"] = MetadataField(passage_length) + fields["count_answer"] = LabelField(count_value, skip_indexing=True) + + instances.append(Instance(fields)) + self.num_instances += 1 + + print(f"SamplesPerBucketCount: {self.samples_per_bucket_count} TotalInstances: {self.num_instances}") + print(f"Data made!") + + return instances + + + def make_data(self, min_passage_length, max_passage_length, min_span_length, max_span_length, + samples_per_bucket_count: int, max_count_value: int = 7): + # For each 100 length bucket, and 
count value, generate 1000 examples in train mode, and 100 in val mode + num_instances_per_bucket_per_count = samples_per_bucket_count + + # List of min and max passage + minmax_passagelen_tuples = self._get_length_buckets(min_passage_length, max_passage_length) + + data_dicts = [] + + print(f"Making Data ... ") + + lenbucket_count_dict = defaultdict() + print(f"Passage Length Buckets: {minmax_passagelen_tuples}") + + for count_value in range(0, max_count_value + 1): + print(f"Count Value: {count_value}") + for min_plen, max_plen in minmax_passagelen_tuples: + instances_for_bucket = 0 + for i in range(num_instances_per_bucket_per_count): + attention = self.make_instance(min_passage_length=min_plen, max_passage_length=max_plen, + min_span_length=min_span_length, max_span_length=max_span_length, + count_value=count_value) + if attention is None: + continue + if count_value not in lenbucket_count_dict: + lenbucket_count_dict[count_value] = defaultdict(int) + lenbucket_count_dict[count_value][(min_plen, max_plen)] += 1 + data_dicts.append({'attention': attention, 'count_value': count_value}) + instances_for_bucket += 1 + print(f"{min_plen}, {max_plen} :: {instances_for_bucket}") + print('\n') + return data_dicts + + + def sample_spansfor_variablelength(self, seqlen, num_spans, span_lengths: List[int]): + sum_lengths = sum(span_lengths) + # We need a gap of atleast 1 token between two spans. Number of heads is computed based on longer spans (+1) + # and offset is also up by +1 + # Range of Number of possible span starts + num_heads = seqlen - (sum_lengths - num_spans + num_spans) + if num_heads < num_spans: + return None + indices = range(seqlen - (sum_lengths - num_spans)) + result = [] + offset = 0 + # Randomly sample n=num_spans heads + for i, idx in enumerate(sorted(random.sample(indices, num_spans))): + # These heads are 0-indexed, to this we add the offset we've covered in the seq + idx += offset + span_length = span_lengths[i] + result.append((idx, idx + span_length)) + offset += span_length - 1 + 1 + return result + + def make_instance(self, min_passage_length: int, max_passage_length: int, + min_span_length: int, max_span_length: int, count_value: int): + + passage_length = random.randint(min_passage_length, max_passage_length) + # Mean: 0, Std: 0.2, Size: PassageLength + attention = np.abs(np.random.normal(0.0, 0.1, passage_length)) + + if count_value > 0: + span_lengths = [random.randint(min_span_length, max_span_length) for _ in range(count_value)] + # Sample n=count_value spans of the same length. 
Ends are exclusive + # sampled_spans = self.sample_spans(passage_length, count_value, span_length) + sampled_spans = self.sample_spansfor_variablelength(passage_length, count_value, span_lengths) + if sampled_spans is None: + return None + + for (start, end) in sampled_spans: + attention[start:end] += 1.0 + + attention_sum = sum(attention) + attention = attention / attention_sum + + return attention + + def _get_length_buckets(self, min_passage_length, max_passage_length): + min_length_buckets = [min_passage_length] + max_length_buckets = [] + + # Add start, end + 100 until end <= max_passage_length + i = 1 + while True: + potential_max_len = i * 100 + min_passage_length + if potential_max_len <= max_passage_length: + max_length_buckets.append(potential_max_len) + min_length_buckets.append(max_length_buckets[-1]) # Last end is next's start + + i += 1 + else: + break + if len(max_length_buckets) == 0 or max_length_buckets[-1] != max_passage_length: # This was left out + max_length_buckets.append(max_passage_length) + + if min_length_buckets[-1] == max_passage_length: + min_length_buckets = min_length_buckets[:-1] + + return list(zip(min_length_buckets, max_length_buckets)) + + # def sample_spans(self, seqlen, num_spans, span_length): + # # We need a gap of atleast 1 token between two spans. Number of heads is computed based on longer spans (+1) + # # and offset is also up by +1 + # # Range of Number of possible span starts + # num_heads = seqlen - (span_length - 1 + 1) * num_spans + # if num_heads < num_spans: + # return None + # indices = range(seqlen - (span_length - 1) * num_spans) + # result = [] + # offset = 0 + # # Randomly sample n=num_spans heads + # for i in sorted(random.sample(indices, num_spans)): + # # These heads are 0-indexed, to this we add the offset we've covered in the seq + # i += offset + # result.append((i, i + span_length)) + # offset += span_length - 1 + 1 + # return result + + +# for i in range(self._num_training_samples): +# fields: Dict[str, Field] = {} +# +# passage_length = random.randint(self._min_passage_length, self._max_passage_length) +# attention = [0.0 for _ in range(passage_length)] +# +# count_value = random.randint(0, 7) +# +# if count_value > 0: +# span_lengths = [random.randint(1, self._max_span_length) +# for _ in range(count_value)] +# +# # Sample n=count_value spans of the same length. 
Ends are exclusive +# # sampled_spans = self.sample_spans(passage_length, count_value, span_length) +# sampled_spans = self.sample_spansfor_variablelength(passage_length, count_value, span_lengths) +# if sampled_spans is None: +# continue +# +# for (start, end) in sampled_spans: +# attention[start:end] = [1.0] * (end - start) +# +# attention = [x + abs(random.gauss(0, 0.05)) for x in attention] +# +# attention_sum = sum(attention) +# attention = [float(x)/attention_sum for x in attention] +# +# fields["passage_attention"] = ArrayField(np.array(attention), padding_value=-1) +# +# fields["passage_lengths"] = MetadataField(passage_length) +# +# fields["count_answer"] = LabelField(count_value, skip_indexing=True) +# +# instances.append(Instance(fields)) +# self.num_instances += 1 \ No newline at end of file diff --git a/semqa/data/iterators/filter_iterator.py b/semqa/data/iterators/filter_iterator.py index da1d350..278baf9 100644 --- a/semqa/data/iterators/filter_iterator.py +++ b/semqa/data/iterators/filter_iterator.py @@ -50,18 +50,11 @@ def __init__(self, def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]: # First break the dataset into memory-sized lists: for instance_list in self._memory_sized_lists(instances): - # print('\n') - # print(len(instance_list)) - # print(self._epochs) - # print('\n') - # exit() - instances_w_epoch_num = 0 for instance in instances: if 'epoch_num' in instance.fields: instances_w_epoch_num += 1 - print(f"\nInstances: {len(instance_list)}") epochs_list = list(self._epochs.values()) @@ -82,13 +75,8 @@ def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Itera strongly_supervised_first = False # CURRICULUM = None - # CURRICULUM = [constants.YARDS_longest_qtype, constants.YARDS_findnum_qtype, constants.YARDS_shortest_qtype] NO_CURRICULUM = [constants.COUNT_filter_find_qtype, constants.MAX_filter_find_qtype, constants.MIN_filter_find_qtype, constants.NUM_filter_find_qtype] - # CURRICULUM = [constants.DATECOMP_QTYPE, constants.NUMCOMP_QTYPE, constants.YARDS_findnum_qtype, - # constants.SYN_NUMGROUND_qtype, constants.SYN_COUNT_qtype] - # CURRICULUM = [constants.DATECOMP_QTYPE, constants.NUMCOMP_QTYPE, constants.YARDS_findnum_qtype, - # constants.COUNT_qtype, constants.SYN_NUMGROUND_qtype, constants.SYN_COUNT_qtype] filtered_instance_list = [] if self.filter_instances: diff --git a/semqa/domain_languages/drop/drop_language.py b/semqa/domain_languages/drop/drop_language.py index a3ef485..c76bf94 100644 --- a/semqa/domain_languages/drop/drop_language.py +++ b/semqa/domain_languages/drop/drop_language.py @@ -350,6 +350,8 @@ def __init__(self, self.passagnum_differences_mat = allenutil.move_to_device(torch.FloatTensor(passagenum_differences_mat), cuda_device=self.device_id) + self.countvals = allenutil.move_to_device(torch.FloatTensor(range(0, 10)), cuda_device=self.device_id) + if self._debug: num_date_tokens = self.passage_datetokens_mask_float.sum() plen = self.passage_mask.sum() @@ -1138,24 +1140,39 @@ def passageAttn2Count(self, passage_attention: PassageAttention) -> CountNumber: # Shape: (passage_length, num_scaling_factors) scaled_passage_attentions = torch.stack(scaled_attentions, dim=1) - # Shape: (hidden_dim, ) + # Shape: (passage_length, hidden_dim) count_hidden_repr = self.parameters.passage_attention_to_count(scaled_passage_attentions.unsqueeze(0), self.passage_mask.unsqueeze(0)).squeeze(0) - # Shape: (num_counts, ) - passage_span_logits = self.parameters.passage_count_predictor(count_hidden_repr) + 
# Shape: (passage_length, 1) + passage_token_logits = self.parameters.passage_count_hidden2logits(count_hidden_repr) + # Shape: (passage_length) + passage_token_logits = passage_token_logits.squeeze(1) + + passage_token_sigmoids = torch.sigmoid(passage_token_logits) + passage_token_sigmoids = passage_token_sigmoids * self.passage_mask + + count_mean = torch.sum(passage_token_sigmoids) + variance = 0.5 + + # Shape: (num_count_values, ) + l2_by_vsquared = torch.pow(self.countvals - count_mean, 2) / (2 * variance * variance) + exp_val = torch.exp(-1 * l2_by_vsquared) + 1e-30 + count_distribution = exp_val / (torch.sum(exp_val)) - count_distribution = torch.softmax(passage_span_logits, dim=0) + # print(count_mean) + # print(count_distribution) loss = 0 # loss += passage_attention.loss debug_value = "" if self._debug: - count = myutils.round_all(myutils.tocpuNPList(count_distribution), 3) - _, pattn_vis_most = dlutils.listTokensVis(passage_attn, self.metadata["passage_tokens"]) - debug_value += f"CountDist: {count}" - debug_value += f"\nPattn: {pattn_vis_most}" + countdist = myutils.round_all(myutils.tocpuNPList(count_distribution), 3) + psigms, pattn_vis_most = dlutils.listTokensVis(passage_token_sigmoids, self.metadata["passage_tokens"]) + debug_value += f"CountDist: {countdist}" + debug_value += f"CountMean: {count_mean}" + debug_value += f"\nPSigms: {psigms}" return CountNumber(count_number_dist=count_distribution, loss=loss, @@ -1331,39 +1348,39 @@ def find_PassageNumber(self, passage_attention: PassageAttention, event_num_grou debug_value=debug_value) - @predicate - def numberDistribution2Count(self, number_distribution: PassageNumber) -> CountNumber: - number_distribution_vector = number_distribution._value - - scaled_attentions = [number_distribution_vector * sf for sf in self.parameters.passage_attention_scalingvals] - # Shape: (passage_length, num_scaling_factors) - stacked_scaled_attentions = torch.stack(scaled_attentions, dim=1) - - # We need a mask vector for the RNN of shape (num_of_passage_values, ) - mask_vector = stacked_scaled_attentions.new_ones(self.num_passage_nums) - - # Shape: (hidden_dim, ) - count_hidden_repr = self.parameters.passage_attention_to_count(stacked_scaled_attentions.unsqueeze(0), - mask_vector.unsqueeze(0)).squeeze(0) - - # Shape: (num_counts, ) - count_logits = self.parameters.passage_count_predictor(count_hidden_repr) - - count_distribution = torch.softmax(count_logits, dim=0) - - loss = 0 - # loss += passage_attention.loss - - debug_value = "" - if self._debug: - count = myutils.round_all(myutils.tocpuNPList(count_distribution), 3) - number_distribution = myutils.round_all(myutils.tocpuNPList(number_distribution_vector), 3) - debug_value += f"CountDist: {count}" - debug_value += f"\nNumDist: {number_distribution}" - - return CountNumber(count_number_dist=count_distribution, - loss=loss, - debug_value=debug_value) + # @predicate + # def numberDistribution2Count(self, number_distribution: PassageNumber) -> CountNumber: + # number_distribution_vector = number_distribution._value + # + # scaled_attentions = [number_distribution_vector * sf for sf in self.parameters.passage_attention_scalingvals] + # # Shape: (passage_length, num_scaling_factors) + # stacked_scaled_attentions = torch.stack(scaled_attentions, dim=1) + # + # # We need a mask vector for the RNN of shape (num_of_passage_values, ) + # mask_vector = stacked_scaled_attentions.new_ones(self.num_passage_nums) + # + # # Shape: (hidden_dim, ) + # count_hidden_repr = 
self.parameters.passage_attention_to_count(stacked_scaled_attentions.unsqueeze(0), + # mask_vector.unsqueeze(0)).squeeze(0) + # + # # Shape: (num_counts, ) + # count_logits = self.parameters.passage_count_predictor(count_hidden_repr) + # + # count_distribution = torch.softmax(count_logits, dim=0) + # + # loss = 0 + # # loss += passage_attention.loss + # + # debug_value = "" + # if self._debug: + # count = myutils.round_all(myutils.tocpuNPList(count_distribution), 3) + # number_distribution = myutils.round_all(myutils.tocpuNPList(number_distribution_vector), 3) + # debug_value += f"CountDist: {count}" + # debug_value += f"\nNumDist: {number_distribution}" + # + # return CountNumber(count_number_dist=count_distribution, + # loss=loss, + # debug_value=debug_value) if __name__=='__main__': diff --git a/semqa/domain_languages/drop/execution_parameters.py b/semqa/domain_languages/drop/execution_parameters.py index 8514846..5b1a7b8 100644 --- a/semqa/domain_languages/drop/execution_parameters.py +++ b/semqa/domain_languages/drop/execution_parameters.py @@ -23,8 +23,9 @@ def __init__(self, passage_encoding_dim: int, passage_attention_to_span: Seq2SeqEncoder, question_attention_to_span: Seq2SeqEncoder, - passage_attention_to_count: Seq2VecEncoder, + passage_attention_to_count: Seq2SeqEncoder, passage_count_predictor=None, + passage_count_hidden2logits=None, dropout: float = 0.0): super().__init__() @@ -42,6 +43,8 @@ def __init__(self, # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(), # self.num_counts) self.passage_count_predictor = passage_count_predictor + # Linear from self.passage_attention_to_count.output_dim --> 1 + self.passage_count_hidden2logits = passage_count_hidden2logits self.dotprod_matrix_attn = DotProductMatrixAttention() diff --git a/semqa/models/drop/drop_parser_wmodel.py b/semqa/models/drop/drop_parser_wmodel.py index a4c1d4b..35cb926 100644 --- a/semqa/models/drop/drop_parser_wmodel.py +++ b/semqa/models/drop/drop_parser_wmodel.py @@ -61,7 +61,8 @@ def __init__(self, modeling_layer: Seq2SeqEncoder, passage_attention_to_span: Seq2SeqEncoder, question_attention_to_span: Seq2SeqEncoder, - passage_attention_to_count: Seq2VecEncoder, + passage_attention_to_count: Seq2SeqEncoder, + # passage_attention_to_count: Seq2VecEncoder, beam_size: int, # decoder_beam_search: ConstrainedBeamSearch, max_decoding_steps: int, @@ -152,6 +153,8 @@ def __init__(self, self.passage_attention_to_count = passage_attention_to_count self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(), self.num_counts, bias=False) + self.passage_count_hidden2logits = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(), + 1, bias=True) # self.passage_count_predictor.bias.data.zero_() # self.passage_count_predictor.bias.requires_grad = False @@ -162,6 +165,7 @@ def __init__(self, question_attention_to_span=question_attention_to_span, passage_attention_to_count=self.passage_attention_to_count, passage_count_predictor=self.passage_count_predictor, + passage_count_hidden2logits=self.passage_count_hidden2logits, dropout=dropout) self.modelloss_metric = Average() @@ -187,9 +191,11 @@ def __init__(self, parameter.requires_grad = False # # # Fix parameters for Counting + count_parameter_names = ['passage_attention_to_count', 'passage_count_hidden2logits', + 'passage_count_predictor'] if countfixed: for name, parameter in self.named_parameters(): - if 'passage_attention_to_count' in name or 'passage_count_predictor' in name: + if 
diff --git a/semqa/models/drop/drop_parser_wmodel.py b/semqa/models/drop/drop_parser_wmodel.py
index a4c1d4b..35cb926 100644
--- a/semqa/models/drop/drop_parser_wmodel.py
+++ b/semqa/models/drop/drop_parser_wmodel.py
@@ -61,7 +61,8 @@ def __init__(self,
                  modeling_layer: Seq2SeqEncoder,
                  passage_attention_to_span: Seq2SeqEncoder,
                  question_attention_to_span: Seq2SeqEncoder,
-                 passage_attention_to_count: Seq2VecEncoder,
+                 passage_attention_to_count: Seq2SeqEncoder,
+                 # passage_attention_to_count: Seq2VecEncoder,
                  beam_size: int,
                  # decoder_beam_search: ConstrainedBeamSearch,
                  max_decoding_steps: int,
@@ -152,6 +153,8 @@ def __init__(self,
         self.passage_attention_to_count = passage_attention_to_count
         self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
                                                        self.num_counts, bias=False)
+        self.passage_count_hidden2logits = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
+                                                           1, bias=True)
         # self.passage_count_predictor.bias.data.zero_()
         # self.passage_count_predictor.bias.requires_grad = False
 
@@ -162,6 +165,7 @@ def __init__(self,
                                              question_attention_to_span=question_attention_to_span,
                                              passage_attention_to_count=self.passage_attention_to_count,
                                              passage_count_predictor=self.passage_count_predictor,
+                                             passage_count_hidden2logits=self.passage_count_hidden2logits,
                                              dropout=dropout)
 
         self.modelloss_metric = Average()
@@ -187,9 +191,11 @@ def __init__(self,
                 parameter.requires_grad = False
 
         # # # Fix parameters for Counting
+        count_parameter_names = ['passage_attention_to_count', 'passage_count_hidden2logits',
+                                 'passage_count_predictor']
         if countfixed:
             for name, parameter in self.named_parameters():
-                if 'passage_attention_to_count' in name or 'passage_count_predictor' in name:
+                if any(span in name for span in count_parameter_names):
                     parameter.requires_grad = False
 
@@ -397,8 +403,9 @@ def forward(self,
                                                       passage_tokenidx2dateidx_mask,
                                                       inwindow_mask, outwindow_mask)
 
-            count_loss = self.number2count_auxloss(passage_number_values=passage_number_values,
-                                                   device_id=device_id)
+            # count_loss = self.number2count_auxloss(passage_number_values=passage_number_values,
+            #                                        device_id=device_id)
+            count_loss = 0.0
 
             aux_win_loss = num_aux_loss + date_aux_loss + count_loss
 
         else:
@@ -616,7 +623,6 @@ def forward(self,
         batch_denotations, batch_denotation_types = self._get_denotations(batch_actionseqs,
                                                                           languages,
                                                                           batch_actionseq_sideargs)
-        output_dict = {}
 
         # Computing losses if gold answers are given
         if answer_program_start_types is not None:
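A small sketch of the name-based freezing that `countfixed` now triggers: any parameter whose name mentions one of the counting modules stops receiving gradients, so the pre-trained count head stays fixed during parser training. The helper name is illustrative; `model` stands for the parser.

import torch

count_parameter_names = ['passage_attention_to_count', 'passage_count_hidden2logits',
                         'passage_count_predictor']

def freeze_count_parameters(model: torch.nn.Module) -> None:
    # Freeze any parameter belonging to the counting modules, identified by name substring.
    for name, parameter in model.named_parameters():
        if any(substring in name for substring in count_parameter_names):
            parameter.requires_grad = False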
diff --git a/semqa/models/drop/passage_attn_to_count.py b/semqa/models/drop/passage_attn_to_count.py
index e25d09e..921aeb6 100644
--- a/semqa/models/drop/passage_attn_to_count.py
+++ b/semqa/models/drop/passage_attn_to_count.py
@@ -2,10 +2,11 @@
 from typing import List, Dict, Any, Tuple, Optional, Set
 import math
 import copy
-
+import numpy as np
 from overrides import overrides
 
 import torch
+import torch.nn.functional as F
 
 from allennlp.data.vocabulary import Vocabulary
 from allennlp.models.model import Model
@@ -13,6 +14,7 @@ from allennlp.nn import InitializerApplicator
 from allennlp.models.reading_comprehension.util import get_best_span
 from allennlp.modules.seq2vec_encoders import Seq2VecEncoder
+from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder
 from allennlp.training.metrics import Average
 
 import datasets.drop.constants as dropconstants
@@ -25,7 +27,7 @@ class PassageAttnToCount(Model):
 
     def __init__(self,
                  vocab: Vocabulary,
-                 passage_attention_to_count: Seq2VecEncoder,
+                 passage_attention_to_count: Seq2SeqEncoder,
                  dropout: float = 0.2,
                  initializers: InitializerApplicator = InitializerApplicator()) -> None:
 
@@ -38,8 +40,14 @@ def __init__(self,
         assert len(self.scaling_vals) == self.passage_attention_to_count.get_input_dim()
 
         self.num_counts = 10
-        self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
-                                                       self.num_counts, bias=False)
+        # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
+        #                                                self.num_counts, bias=False)
+
+        # We want to predict a score for each passage token
+        self.passage_count_hidden2logits = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
+                                                           1, bias=True)
+
+        self.passagelength_to_bias = torch.nn.Linear(1, 1, bias=True)
 
         self.count_acc = Average()
 
@@ -49,6 +57,8 @@ def __init__(self,
             self._dropout = lambda x: x
 
         initializers(self)
+        # self.passage_count_hidden2logits.bias.data.fill_(-1.0)
+        # self.passage_count_hidden2logits.bias.requires_grad = False
 
     def device_id(self):
         allenutil.get_device_of()
@@ -57,57 +67,83 @@ def device_id(self):
     def forward(self,
                 passage_attention: torch.Tensor,
                 passage_lengths: List[int],
-                count_mask: torch.Tensor,
-                answer_as_count: torch.LongTensor = None,
+                count_answer: torch.LongTensor = None,
                 metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
 
+        device_id = allenutil.get_device_of(passage_attention)
         batch_size, max_passage_length = passage_attention.size()
 
-        passage_mask = passage_attention.new_zeros(batch_size, max_passage_length)
-        for i, passage_length in enumerate(passage_lengths):
-            passage_mask[i, 0:passage_length] = 1.0
-
-        # (B, num_counts)
-        answer_as_count = answer_as_count.float()
-        count_mask = count_mask.float()
+        # Shape: (B, passage_length)
+        passage_mask = (passage_attention >= 0).float()
 
         # List of (B, P) shaped tensors
         scaled_attentions = [passage_attention * sf for sf in self.scaling_vals]
         # Shape: (B, passage_length, num_scaling_factors)
         scaled_passage_attentions = torch.stack(scaled_attentions, dim=2)
 
-        # Shape: (B, hidden_dim)
+        # Shape: (batch_size, 1)
+        passage_len_bias = self.passagelength_to_bias(passage_mask.sum(1, keepdim=True))
+
+        scaled_passage_attentions = scaled_passage_attentions * passage_mask.unsqueeze(2)
+
+        # Shape: (B, passage_length, hidden_dim)
         count_hidden_repr = self.passage_attention_to_count(scaled_passage_attentions, passage_mask)
 
-        # Shape: (B, num_counts)
-        passage_span_logits = self.passage_count_predictor(count_hidden_repr)
-        count_distribution = torch.softmax(passage_span_logits, dim=1)
+        # Shape: (B, passage_length, 1) -- score for each token
+        passage_span_logits = self.passage_count_hidden2logits(count_hidden_repr)
+        # Shape: (B, passage_length) -- sigmoid on token-score
+        token_sigmoids = torch.sigmoid(passage_span_logits.squeeze(2))
+        token_sigmoids = token_sigmoids * passage_mask
 
-        # Loss computation
-        output_dict = {}
-        log_likelihood = 0.0
+        # Shape: (B, 1) -- sum of sigmoids. This will act as the predicted mean
+        # passage_count_mean = torch.sum(token_sigmoids, dim=1, keepdim=True) + passage_len_bias
+        passage_count_mean = torch.sum(token_sigmoids, dim=1, keepdim=True)
 
-        num_masked_instances = torch.sum(count_mask)
+        # Shape: (1, count_vals)
+        self.countvals = allenutil.get_range_vector(10, device=device_id).unsqueeze(0).float()
 
-        if answer_as_count is not None:
+        variance = 0.2
 
-            count_log_probs = torch.log(count_distribution + 1e-40)
-            log_likelihood = torch.sum(count_log_probs * answer_as_count * count_mask.unsqueeze(1))
+        # Shape: (batch_size, count_vals)
+        l2_by_vsquared = torch.pow(self.countvals - passage_count_mean, 2) / (2 * variance * variance)
+        exp_val = torch.exp(-1 * l2_by_vsquared) + 1e-30
+        # Shape: (batch_size, count_vals)
+        count_distribution = exp_val / (torch.sum(exp_val, 1, keepdim=True))
 
+        # Loss computation
+        output_dict = {}
+        loss = 0.0
+        pred_count_idx = torch.argmax(count_distribution, 1)
+        if count_answer is not None:
+            # L2-loss
+            passage_count_mean = passage_count_mean.squeeze(1)
+            L2Loss = F.mse_loss(input=passage_count_mean, target=count_answer.float())
+            loss = L2Loss
+            predictions = passage_count_mean.detach().cpu().numpy()
+            predictions = np.round_(predictions)
+
+            gold_count = count_answer.detach().cpu().numpy()
+            correct_vec = (predictions == gold_count)
+            correct_perc = sum(correct_vec) / batch_size
+            # print(f"{correct_perc} {predictions} {gold_count}")
+            self.count_acc(correct_perc)
+
+            # loss = F.cross_entropy(input=count_distribution, target=count_answer)
             # List of predicted count idxs, Shape: (B,)
-            count_idx = torch.argmax(count_distribution, 1)
-            gold_count_idxs = torch.argmax(answer_as_count, 1)
-            correct_vec = ((count_idx == gold_count_idxs).float() * count_mask)
-            if num_masked_instances > 0:
-                correct_perc = torch.sum(correct_vec) / num_masked_instances
-            else:
-                correct_perc = torch.sum(correct_vec)
-            self.count_acc(correct_perc.item())
-
-            loss = -1.0 * log_likelihood
+            # correct_vec = (pred_count_idx == count_answer).float()
+            # correct_perc = torch.sum(correct_vec) / batch_size
+            # self.count_acc(correct_perc.item())
 
         batch_loss = loss / batch_size
         output_dict["loss"] = batch_loss
+        output_dict["passage_attention"] = passage_attention
+        output_dict["passage_sigmoid"] = token_sigmoids
+        output_dict["count_mean"] = passage_count_mean
+        output_dict["count_distribution"] = count_distribution
+        output_dict["count_answer"] = count_answer
+        output_dict["pred_count"] = pred_count_idx
 
         return output_dict
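The counting head above replaces the old softmax-over-counts classifier: token logits are squashed with a sigmoid, their sum acts as a real-valued count estimate, and that estimate is softly binned into a distribution over the counts 0-9 with a Gaussian kernel before being trained with an L2 loss against the gold count. A self-contained sketch of that computation (the variance and the example means are illustrative):

import torch
import torch.nn.functional as F

def count_distribution_from_mean(count_mean: torch.Tensor, num_counts: int = 10,
                                 variance: float = 0.2) -> torch.Tensor:
    """count_mean: (batch_size, 1), e.g. the sum of per-token sigmoids."""
    countvals = torch.arange(num_counts).float().unsqueeze(0)        # (1, num_counts)
    l2_by_vsquared = (countvals - count_mean) ** 2 / (2 * variance * variance)
    exp_val = torch.exp(-1.0 * l2_by_vsquared) + 1e-30
    return exp_val / exp_val.sum(dim=1, keepdim=True)                # (batch_size, num_counts)

count_mean = torch.tensor([[2.7], [0.4]])                            # illustrative predicted means
count_dist = count_distribution_from_mean(count_mean)
print(count_dist.argmax(dim=1))                                      # tensor([3, 0]), the nearest counts

# Training targets the gold count directly through an L2 loss on the mean:
gold_count = torch.tensor([3, 0])
loss = F.mse_loss(count_mean.squeeze(1), gold_count.float())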
output_dict["count_distritbuion"] = count_distribution + output_dict["count_answer"] = count_answer + output_dict["pred_count"] = pred_count_idx + return output_dict diff --git a/semqa/predictors/drop/pattn2count_predictor.py b/semqa/predictors/drop/pattn2count_predictor.py new file mode 100644 index 0000000..ce15f66 --- /dev/null +++ b/semqa/predictors/drop/pattn2count_predictor.py @@ -0,0 +1,127 @@ +from typing import List, Union + +from overrides import overrides + +from allennlp.common.util import JsonDict, sanitize, group_by_count +from allennlp.data import DatasetReader, Instance +from allennlp.models import Model +from allennlp.predictors.predictor import Predictor +import datasets.hotpotqa.utils.constants as hpconstants +import utils.util as myutils + +from allennlp.tools.squad_eval import metric_max_over_ground_truths +from allennlp.tools.drop_eval import (get_metrics as drop_em_and_f1, + answer_json_to_strings) + + +@Predictor.register("pattn2count_predictor") +class Pattn2CountPredictor(Predictor): + """ + Predictor for the :class:`~allennlp.models.bidaf.SemanticRoleLabeler` model. + """ + def __init__(self, model: Model, dataset_reader: DatasetReader) -> None: + super().__init__(model, dataset_reader) + + + @overrides + def predict_instance(self, instance: Instance) -> JsonDict: + outputs = self._model.forward_on_instance(instance) + return sanitize(outputs) + + + def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]: + outputs = self._model.forward_on_instances(instances) + return sanitize(outputs) + + @overrides + def dump_line(self, outputs: JsonDict) -> str: # pylint: disable=no-self-use + # Use json.dumps(outputs) + "\n" to dump a dictionary + + out_str = '' + pattn = outputs["passage_attention"] + pattn = myutils.round_all(pattn, 4) + psigmoid = outputs["passage_sigmoid"] + psigmoid = myutils.round_all(psigmoid, 4) + + attn_sigm = list(zip(pattn, psigmoid)) + + passage_count_mean = outputs["count_mean"] + count_distribution = outputs["count_distritbuion"] + count_answer = outputs["count_answer"] + pred_count_idx = outputs["pred_count"] + + out_str += f"Pattn: {pattn}" + "\n" + out_str += f"Psigm: {psigmoid}" + "\n" + out_str += f"Pattn_sigm: {attn_sigm}" + "\n" + out_str += f"Plen: {len(pattn)}" + "\n" + out_str += f"PattnSum: {sum(pattn)}" + "\n" + out_str += f"PSigmSum: {sum(psigmoid)}" + "\n" + out_str += f"CountMean: {passage_count_mean}" + '\n' + out_str += f"CountDist: {count_distribution}" + '\n' + out_str += f"CountAnswer: {count_answer}" + '\n' + out_str += f"Predicted CountAnswer: {pred_count_idx}" + '\n' + out_str += '--------------------------------------------------\n' + + return out_str + + # @overrides + # def _json_to_instance(self, json_dict: JsonDict): + # jsonobj = json_dict + # # space delimited tokenized + # question = jsonobj[hpconstants.q_field] + # + # answer = jsonobj[hpconstants.ans_field] + # # List of question mentions. 
+    #     # TODO(nitish): Fix this to include all types
+    #     q_nemens = jsonobj[hpconstants.q_ent_ner_field]
+    #     # List of (title, space_delimited_tokenized_contexts)
+    #     contexts = jsonobj[hpconstants.context_field]
+    #     # List of list --- For each context, list of mention-tuples as (text, start, end, label)
+    #     contexts_ent_ners = jsonobj[hpconstants.context_ent_ner_field]
+    #     contexts_num_ners = jsonobj[hpconstants.context_num_ner_field]
+    #     contexts_date_ners = jsonobj[hpconstants.context_date_ner_field]
+    #
+    #     # Mention to entity mapping -- used to make the grounding vector
+    #     context_entmens2entidx = jsonobj[hpconstants.context_nemens2entidx]
+    #     context_nummens2entidx = jsonobj[hpconstants.context_nummens2entidx]
+    #     context_datemens2entidx = jsonobj[hpconstants.context_datemens2entidx]
+    #
+    #     # Entity to mentions --- Used to find the number of entities of each type in the contexts
+    #     context_eqent2entmens = jsonobj[hpconstants.context_eqent2entmens]
+    #     context_eqent2nummens = jsonobj[hpconstants.context_eqent2nummens]
+    #     context_eqent2datemens = jsonobj[hpconstants.context_eqent2datemens]
+    #
+    #     # Dict from {date_string: (date, month, year)} normalization. -1 indicates invalid field
+    #     dates_normalized_dict = jsonobj[hpconstants.dates_normalized_field]
+    #     # Dict from {num_string: float_val} normalization.
+    #     nums_normalized_dict = jsonobj[hpconstants.nums_normalized_field]
+    #     # Dict from {ent_idx: [(context_idx, men_idx)]} --- output of CDCR
+    #
+    #     # Grounding of ques entity mentions
+    #     qnemens_to_ent = jsonobj[hpconstants.q_entmens2entidx]
+    #
+    #     ans_type = None
+    #     ans_grounding = None
+    #     # if hpconstants.ans_type_field in jsonobj:
+    #     #     ans_type = jsonobj[hpconstants.ans_type_field]
+    #     #     ans_grounding = jsonobj[hpconstants.ans_grounding_field]
+    #
+    #     instance = self._dataset_reader.text_to_instance(question,
+    #                                                      answer,
+    #                                                      q_nemens,
+    #                                                      contexts,
+    #                                                      contexts_ent_ners,
+    #                                                      contexts_num_ners,
+    #                                                      contexts_date_ners,
+    #                                                      context_entmens2entidx,
+    #                                                      context_nummens2entidx,
+    #                                                      context_datemens2entidx,
+    #                                                      context_eqent2entmens,
+    #                                                      context_eqent2nummens,
+    #                                                      context_eqent2datemens,
+    #                                                      dates_normalized_dict,
+    #                                                      nums_normalized_dict,
+    #                                                      qnemens_to_ent,
+    #                                                      ans_type,
+    #                                                      ans_grounding)
+    #     return instance
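Finally, a hedged sketch (not from this commit) of driving the new predictor programmatically instead of through the predict.sh script; the archive path is a placeholder and the snippet assumes an archived pattn2count model.

from allennlp.models.archival import load_archive
from allennlp.predictors.predictor import Predictor

archive = load_archive("path/to/drop_pattn2count/model.tar.gz")       # placeholder path
predictor = Predictor.from_archive(archive, "pattn2count_predictor")

# Instances come from the synthetic passage-attention reader, so the Instance-based
# entry points are the natural ones:
# for instance in instances:
#     print(predictor.dump_line(predictor.predict_instance(instance)))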