Commit 3cc2fc4

Yada Pruksachatkun committed May 6, 2019
2 parents 08a78bf + 2f42599

Showing 142 changed files with 8,267 additions and 8,740 deletions.
33 changes: 33 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,33 @@
# Python CircleCI 2.0 configuration file
version: 2.1
jobs:
test:
docker:
- image: continuumio/miniconda3

working_directory: ~/repo

steps:
# Step 1: obtain repo from GitHub
- checkout
# Step 2: create virtual env and install dependencies
- run:
name: install dependencies
command: |
apt-get update
apt-get install -y cmake build-essential gcc g++
conda env create -q -f environment.yml
source activate jiant
# Step 3: run tests
- run:
name: run tests
command: |
mkdir ~/repo/test_output
source activate jiant
python -m nltk.downloader perluniprops nonbreaking_prefixes punkt
nose2 -v
workflows:
version: 2
test:
jobs:
- test
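
For context on the `nose2 -v` step: nose2 discovers standard `unittest`-style tests, so any test module in the repository gets collected. A minimal sketch of a test file it would pick up (the module and test names here are hypothetical, not part of this commit):

```python
import unittest


class TestSmoke(unittest.TestCase):
    """Hypothetical smoke test; nose2 collects any unittest.TestCase it discovers."""

    def test_tokenizer_roundtrip(self):
        tokens = "the cat sat".split()
        self.assertEqual(" ".join(tokens), "the cat sat")


if __name__ == "__main__":
    unittest.main()
```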
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ glue_data/
user_config.sh
.idea
.ipynb_checkpoints/
perluniprops/
7 changes: 5 additions & 2 deletions .pep8speaks.yml
@@ -3,13 +3,16 @@ scanner:

pycodestyle: # Same as scanner.linter value. Other option is flake8
max-line-length: 100 # Default is 79 in PEP 8
ignore:
- E203 # Whitespace before :, not a strict PEP8 requirement and sometimes incompatible with black.
- W503 # Deprecated, incompatible with black.

no_blank_comment: False # If True, no comment is made on PR without any errors.

message: # Customize the comment made by the bot
opened: # Messages when a new PR is submitted
# The keyword {name} is converted into the author's username
footer: "You can automatically repair most issues by commenting `@pep8speaks pep8ify` on GitHub or by locally running: `python -m autopep8 --max-line-length 100 --in-place --aggressive --aggressive ./*.py ./*/*.py ./*/*/*.py`"
footer: "You can repair most issues by installing [black](https://github.com/ambv/black) and running: `black -l 100 ./*`. If you contribute often, have a look at the 'Contributing' section of the [README](https://github.com/nyu-mll/jiant) for instructions on doing this automatically."
# The messages can be written as they would over GitHub
updated: # Messages when new commits are added to the PR
footer: "You can automatically repair most issues by locally running: `python -m autopep8 --max-line-length 100 --in-place --aggressive --aggressive ./*.py ./*/*.py ./*/*/*.py`"
footer: "You can repair most issues by installing [black](https://github.com/ambv/black) and running: `black -l 100 ./*`. If you contribute often, have a look at the 'Contributing' section of the [README](https://github.com/nyu-mll/jiant) for instructions on doing this automatically."
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,10 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.1.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- repo: https://github.com/ambv/black
rev: 19.3b0
hooks:
- id: black
8 changes: 8 additions & 0 deletions .pre-commit-hooks.yaml
@@ -0,0 +1,8 @@
- id: black
name: black
description: 'Black: The uncompromising Python code formatter'
entry: black
language: python
language_version: python3
require_serial: true
types: [python]
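
Most of the churn in the Python diffs below (including `cola_inference.py`) is mechanical reformatting by black. As an illustration, using the `--overrides` argument from the diff below: with `-l 100`, black normalizes strings to double quotes and, when a call exceeds the line length, explodes it to one argument per line with a trailing comma:

```python
import argparse

parser = argparse.ArgumentParser(description="")

# Before black (single quotes, continuation aligned to the open paren):
# parser.add_argument('--overrides', '-o', type=str, default=None,
#                     help="Parameter overrides, as valid HOCON string.")

# After `black -l 100` (double quotes, one argument per line, trailing comma):
parser.add_argument(
    "--overrides",
    "-o",
    type=str,
    default=None,
    help="Parameter overrides, as valid HOCON string.",
)
```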
1 change: 0 additions & 1 deletion Dockerfile
@@ -92,5 +92,4 @@ ENV PATH_TO_COVE "$JSALT_SHARE_DIR/cove"
ENV ELMO_SRC_DIR "$JSALT_SHARE_DIR/elmo"

# Set these manually with -e or via Kubernetes config YAML.
# ENV NFS_PROJECT_PREFIX "/nfs/jsalt/exp/docker"
# ENV JIANT_PROJECT_PREFIX "$NFS_PROJECT_PREFIX"
262 changes: 16 additions & 246 deletions README.md

Large diffs are not rendered by default.

184 changes: 107 additions & 77 deletions cola_inference.py
@@ -1,4 +1,4 @@
'''
"""
Run a model inference via a REPL (read-eval-print loop), or by processing an input corpus file.
To run as REPL (default):
@@ -29,82 +29,106 @@
(Ensure that the repository is in your PYTHONPATH when running this script.)
'''
"""
# pylint: disable=no-member
import argparse
import json
import numpy as np
import logging as log
import os
import pandas as pd
import sys

import logging as log
from tqdm import tqdm

import numpy as np
import pandas as pd
import torch
from allennlp.data import Instance, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.nn.util import move_to_device
from tqdm import tqdm

from src.models import build_model
from src.preprocess import build_indexers, build_tasks
from src.tasks.tasks import process_sentence, sentence_to_text_field
from src.utils import config
from src.utils.utils import load_model_state, check_arg_name
from src.utils.data_loaders import load_tsv
from src.preprocess import build_tasks, build_indexers
from src.models import build_model
from src.utils.utils import check_arg_name, load_model_state

from allennlp.data import Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data import Instance
from allennlp.nn.util import move_to_device

log.basicConfig(format='%(asctime)s: %(message)s',
datefmt='%m/%d %I:%M:%S %p', level=log.INFO)
log.basicConfig(format="%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p", level=log.INFO)


def handle_arguments(cl_arguments):
parser = argparse.ArgumentParser(description='')
parser = argparse.ArgumentParser(description="")

# Configuration files
parser.add_argument('--config_file', '-c', type=str, nargs="+",
help="Config file(s) (.conf) for model parameters.")
parser.add_argument('--overrides', '-o', type=str, default=None,
help="Parameter overrides, as valid HOCON string.")
parser.add_argument(
"--config_file",
"-c",
type=str,
nargs="+",
help="Config file(s) (.conf) for model parameters.",
)
parser.add_argument(
"--overrides",
"-o",
type=str,
default=None,
help="Parameter overrides, as valid HOCON string.",
)

# Inference arguments
parser.add_argument('--model_file_path', type=str, required=True,
help="Path to saved model (.th).")
parser.add_argument('--inference_mode', type=str, default="repl",
help="Run as REPL, or process a corpus file."
" [repl, corpus]")
parser.add_argument('--input_path', type=str, default=None,
help="Input path for running inference."
" One input per line."
" Only in eval_mode='corpus'")
parser.add_argument('--input_format', type=str, default="text",
help="Format of input (text | train | dev | test)")
parser.add_argument('--output_path', type=str, default=None,
help="Output path for running inference."
" Only in eval_mode='corpus'")
parser.add_argument('--eval_output_path', type=str, default=None,
help="Output path for metrics from evaluation."
" Only in eval_mode='corpus'")
parser.add_argument(
"--model_file_path", type=str, required=True, help="Path to saved model (.th)."
)
parser.add_argument(
"--inference_mode",
type=str,
default="repl",
help="Run as REPL, or process a corpus file." " [repl, corpus]",
)
parser.add_argument(
"--input_path",
type=str,
default=None,
help="Input path for running inference."
" One input per line."
" Only in eval_mode='corpus'",
)
parser.add_argument(
"--input_format",
type=str,
default="text",
help="Format of input (text | train | dev | test)",
)
parser.add_argument(
"--output_path",
type=str,
default=None,
help="Output path for running inference." " Only in eval_mode='corpus'",
)
parser.add_argument(
"--eval_output_path",
type=str,
default=None,
help="Output path for metrics from evaluation." " Only in eval_mode='corpus'",
)

return parser.parse_args(cl_arguments)
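
Since `handle_arguments` is plain argparse, it can be exercised directly; a minimal sketch, with hypothetical file paths:

```python
# Hypothetical invocation of handle_arguments() above; the paths are placeholders.
cl_args = handle_arguments(
    ["--config_file", "jiant.conf", "--model_file_path", "model.th", "--inference_mode", "corpus"]
)
print(cl_args.inference_mode)  # "corpus"
print(cl_args.input_format)  # "text" (the default)
```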


def main(cl_arguments):
''' Run REPL for a CoLA model '''
""" Run REPL for a CoLA model """

# Arguments handling #
cl_args = handle_arguments(cl_arguments)
args = config.params_from_file(cl_args.config_file, cl_args.overrides)
check_arg_name(args)
assert args.target_tasks == "cola", \
"Currently only supporting CoLA. ({})".format(args.target_tasks)
assert args.target_tasks == "cola", "Currently only supporting CoLA. ({})".format(
args.target_tasks
)

if args.cuda >= 0:
try:
if not torch.cuda.is_available():
raise EnvironmentError("CUDA is not available, or not detected"
" by PyTorch.")
raise EnvironmentError("CUDA is not available, or not detected" " by PyTorch.")
log.info("Using GPU %d", args.cuda)
torch.cuda.set_device(args.cuda)
except Exception:
@@ -121,12 +145,11 @@ def main(cl_arguments):
# Build or load model #
model = build_model(args, vocab, word_embs, tasks)
log.info("Loading existing model from %s...", cl_args.model_file_path)
load_model_state(model, cl_args.model_file_path,
args.cuda, [], strict=False)
load_model_state(model, cl_args.model_file_path, args.cuda, [], strict=False)

# Inference Setup #
model.eval()
vocab = Vocabulary.from_files(os.path.join(args.exp_dir, 'vocab'))
vocab = Vocabulary.from_files(os.path.join(args.exp_dir, "vocab"))
indexers = build_indexers(args)
task = take_one(tasks)

@@ -137,18 +160,23 @@
print("Running REPL for task: {}".format(task.name))
run_repl(model, vocab, indexers, task, args)
elif cl_args.inference_mode == "corpus":
run_corpus_inference(model, vocab, indexers, task, args,
cl_args.input_path,
cl_args.input_format,
cl_args.output_path,
cl_args.eval_output_path,
)
run_corpus_inference(
model,
vocab,
indexers,
task,
args,
cl_args.input_path,
cl_args.input_format,
cl_args.output_path,
cl_args.eval_output_path,
)
else:
raise KeyError(cl_args.inference_mode)


def run_repl(model, vocab, indexers, task, args):
''' Run REPL '''
""" Run REPL """
print("Input CTRL-C or enter 'QUIT' to terminate.")
while True:
try:
@@ -158,9 +186,7 @@ def run_repl(model, vocab, indexers, task, args):
break

tokens = process_sentence(
tokenizer_name=task.tokenizer_name,
sent=input_string,
max_seq_len=args.max_seq_len,
tokenizer_name=task.tokenizer_name, sent=input_string, max_seq_len=args.max_seq_len
)
print("TOKENS:", " ".join("[{}]".format(tok) for tok in tokens))
field = sentence_to_text_field(tokens, indexers)
@@ -184,10 +210,10 @@ def run_repl(model, vocab, indexers, task, args):
break


def run_corpus_inference(model, vocab, indexers, task, args,
input_path, input_format, output_path,
eval_output_path):
''' Run inference on corpus '''
def run_corpus_inference(
model, vocab, indexers, task, args, input_path, input_format, output_path, eval_output_path
):
""" Run inference on corpus """
tokens, labels = load_cola_data(input_path, task, input_format, args.max_seq_len)
logit_batches = []
for tokens_batch in tqdm(list(batchify(tokens, args.batch_size))):
@@ -205,10 +231,7 @@ def run_corpus_inference(model, vocab, indexers, task, args,
# Future-proofing
assert task.name == "cola"
num_classes = logits.shape[1]
columns = (
[f"logit_{i}" for i in range(num_classes)]
+ [f"prob_{i}" for i in range(num_classes)]
)
columns = [f"logit_{i}" for i in range(num_classes)] + [f"prob_{i}" for i in range(num_classes)]

df = pd.DataFrame(data_out, columns=columns)
df["pred"] = preds
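
For intuition on the output layout: assuming `data_out` holds the logits concatenated with their row-wise softmax probabilities (the diff does not show its construction), the resulting frame looks like this standalone sketch with made-up numbers:

```python
import numpy as np
import pandas as pd

# Hypothetical logits for three sentences over the two CoLA classes.
logits = np.array([[1.2, -0.3], [-0.5, 0.8], [2.0, 1.9]])
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # row-wise softmax

num_classes = logits.shape[1]
columns = [f"logit_{i}" for i in range(num_classes)] + [f"prob_{i}" for i in range(num_classes)]
df = pd.DataFrame(np.concatenate([logits, probs], axis=1), columns=columns)
df["pred"] = probs.argmax(axis=1)
print(df)  # columns: logit_0, logit_1, prob_0, prob_1, pred
```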
@@ -227,15 +250,15 @@


def batchify(ls, batch_size):
''' Partition a list into batches of batch_size '''
""" Partition a list into batches of batch_size """
i = 0
while i < len(ls):
yield ls[i:i + batch_size]
yield ls[i : i + batch_size]
i += batch_size
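
A quick usage note: `batchify` is a plain generator over list slices, so the final batch may be smaller than `batch_size`:

```python
# Using batchify() defined above.
list(batchify(list(range(5)), 2))  # [[0, 1], [2, 3], [4]]
```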


def prepare_batch(tokens_batch, vocab, indexers, args):
''' Do preprocessing for batch '''
""" Do preprocessing for batch """
instance_ls = []
token_ls = []
for tokens in tokens_batch:
Expand All @@ -249,7 +272,7 @@ def prepare_batch(tokens_batch, vocab, indexers, args):


def take_one(ls):
''' Extract singleton from list '''
""" Extract singleton from list """
assert len(ls) == 1
return ls[0]

@@ -260,20 +283,27 @@ def load_cola_data(input_path, task, input_format, max_seq_len):
sentences = f_in.readlines()
tokens = [
process_sentence(
tokenizer_name=task.tokenizer_name,
sent=sentence,
max_seq_len=max_seq_len,
tokenizer_name=task.tokenizer_name, sent=sentence, max_seq_len=max_seq_len
)
for sentence in sentences
]
labels = None
elif input_format == "train" or input_format == "dev":
data = load_tsv(task.tokenizer_name, input_path, max_seq_len,
s1_idx=3, s2_idx=None, label_idx=1)
data = load_tsv(
task.tokenizer_name, input_path, max_seq_len, s1_idx=3, s2_idx=None, label_idx=1
)
tokens, labels = data[0], data[2]
elif input_format == "test":
data = load_tsv(task.tokenizer_name, input_path, max_seq_len,
s1_idx=1, s2_idx=None, has_labels=False, return_indices=True, skip_rows=1)
data = load_tsv(
task.tokenizer_name,
input_path,
max_seq_len,
s1_idx=1,
s2_idx=None,
has_labels=False,
return_indices=True,
skip_rows=1,
)
tokens, labels = data[0], None
else:
raise KeyError(input_format)
@@ -289,5 +319,5 @@ def get_cola_metrics(logits, preds, labels, task):
return task.get_metrics(reset=True)


if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv[1:])