
Commit d19c792

Implement beam search for CopyNet (#18)
1 parent 399d27c commit d19c792

17 files changed: +800 −132 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -102,3 +102,7 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# scratch files
+scratch*
+tmp*

Makefile

Lines changed: 6 additions & 1 deletion
@@ -1,3 +1,4 @@
+debug = 0
 test = modules
 COVERAGE := $(addprefix --cov=, $(test))
 PYTHONPATH = allennlp
@@ -12,7 +13,11 @@ EXPERIMENTS := $(wildcard $(EXPERIMENTDIR)/**/*.json)
 
 .PHONY : train
 train :
+ifeq ($(debug),0)
 	./scripts/train.sh
+else
+	CUDA_LAUNCH_BLOCKING=1 ./scripts/train.sh
+endif
 
 # Need this to force targets to build, even when the target file exists.
 .PHONY : phony-target
@@ -47,7 +52,7 @@ lint :
 .PHONY : unit-test
 unit-test :
 	@echo "Unit tests: pytest"
-ifeq ($(suffix $(test)),.py)
+ifneq ($(findstring test,$(test)),)
 	PYTHONPATH=$(PYTHONPATH) python -m pytest -v --color=yes $(test)
 else
 	PYTHONPATH=$(PYTHONPATH) python -m pytest -v --cov-config .coveragerc $(COVERAGE) --color=yes $(test)

README.md

Lines changed: 4 additions & 2 deletions
@@ -20,16 +20,18 @@ After AllenNLP is installed, you can define your own experiments with an AllenNL
 
 ## Models implemented
 
-- (WIP) [CopyNet](https://arxiv.org/abs/1603.06393): A sequence-to-sequence model that incorporates a copying mechanism, which enables the model to copy tokens from the source sentence into the target sentence even if they are not part of the target vocabulary. This architecture has shown promising results on machine translation and semantic parsing tasks.
+- [CopyNet](https://arxiv.org/abs/1603.06393): A sequence-to-sequence model that incorporates a copying mechanism, which enables the model to copy tokens from the source sentence into the target sentence even if they are not part of the target vocabulary. This architecture has shown promising results on machine translation and semantic parsing tasks.
 
 ## Datasets
 
+- Greetings: A simple made-up dataset of greetings (the source sentences) and replies (the target sentences). The greetings are things like "Hi, my name is Jon Snow" and the replies are in the format "Nice to meet you, Jon Snow!". This is completely artificial and is just meant to show the usefulness of the copy mechanism in CopyNet.
 - [NL2Bash](http://arxiv.org/abs/1802.08979): A challenging dataset that consists of bash one-liners along with corresponding expert descriptions. The goal is to translate the natural language descriptions into the bash commands.
 
 ## Experiments
 
+- Greetings dataset with CopyNet: run `make experiments/greetings/copynet.json` to train.
 - (WIP) [NL2Bash with CopyNet](./experiments/nl2bash/copynet.json): run `make experiments/nl2bash/copynet.json` to train.
 
 ## TODO
 
-- Implement beam search for CopyNet
+- Implement custom metrics for NL2Bash.
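
The TODO item crossed off above is the feature this commit delivers: instead of greedily taking the single best token at each decoding step, the decoder now keeps several candidate output sequences alive and extends them in parallel. For orientation only, here is a minimal, generic beam-search sketch; it is not the code added in this commit, and `log_prob_fn`, `start_token`, and `end_token` are hypothetical names used purely for illustration.

```python
from typing import Callable, List, Tuple


def beam_search(start_token: int,
                end_token: int,
                log_prob_fn: Callable[[List[int]], List[Tuple[int, float]]],
                beam_size: int = 3,
                max_steps: int = 20) -> List[Tuple[List[int], float]]:
    # Each live hypothesis is a (token sequence, cumulative log-probability) pair.
    beam: List[Tuple[List[int], float]] = [([start_token], 0.0)]
    finished: List[Tuple[List[int], float]] = []
    for _ in range(max_steps):
        candidates: List[Tuple[List[int], float]] = []
        for prefix, score in beam:
            # log_prob_fn scores every candidate next token given the prefix.
            for token, log_prob in log_prob_fn(prefix):
                candidates.append((prefix + [token], score + log_prob))
        # Keep only the `beam_size` highest-scoring extensions.
        candidates.sort(key=lambda cand: cand[1], reverse=True)
        beam = []
        for prefix, score in candidates[:beam_size]:
            if prefix[-1] == end_token:
                finished.append((prefix, score))
            else:
                beam.append((prefix, score))
        if not beam:
            break
    return sorted(finished or beam, key=lambda cand: cand[1], reverse=True)
```

The `beam_size` and `max_decoding_steps` settings in the experiment configs below play the roles of `beam_size` and `max_steps` here.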

data/greetings.tar.gz

62.1 KB
Binary file not shown.

data/names.tar.gz

254 KB
Binary file not shown.

experiments/greetings/copynet.json

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+{
+  "dataset_reader": {
+    "target_namespace": "target_tokens",
+    "type": "copynet",
+    "source_token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "namespace": "source_tokens"
+      },
+      "token_characters": {
+        "type": "characters"
+      }
+    },
+    "target_token_indexers": {
+      "tokens": {
+        "namespace": "target_tokens"
+      }
+    }
+  },
+  "vocabulary": {
+    "min_count": {
+      "source_tokens": 4,
+      "target_tokens": 4
+    },
+    "tokens_to_add": {
+      "target_tokens": ["@COPY@"]
+    }
+  },
+  "train_data_path": "data/greetings/train.tsv",
+  "validation_data_path": "data/greetings/validation.tsv",
+  "model": {
+    "type": "copynet",
+    "source_embedder": {
+      "tokens": {
+        "type": "embedding",
+        "vocab_namespace": "source_tokens",
+        "embedding_dim": 25,
+        "trainable": true
+      },
+      "token_characters": {
+        "type": "character_encoding",
+        "embedding": {
+          "embedding_dim": 10
+        },
+        "encoder": {
+          "type": "lstm",
+          "input_size": 10,
+          "hidden_size": 10,
+          "num_layers": 2,
+          "dropout": 0,
+          "bidirectional": true
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 45,
+      "hidden_size": 100,
+      "num_layers": 2,
+      "dropout": 0,
+      "bidirectional": true
+    },
+    "attention": {
+      "type": "bilinear",
+      "vector_dim": 200,
+      "matrix_dim": 200
+    },
+    "target_embedding_dim": 10,
+    "beam_size": 3,
+    "max_decoding_steps": 20
+  },
+  "iterator": {
+    "type": "bucket",
+    "padding_noise": 0.0,
+    "batch_size" : 32,
+    "sorting_keys": [["source_tokens", "num_tokens"]]
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "sgd",
+      "lr": 0.015
+    },
+    "learning_rate_scheduler": {
+      "type": "cosine",
+      "t_initial": 5,
+      "t_mul": 1.5,
+      "eta_mul": 0.9
+    },
+    "num_epochs": 80,
+    "cuda_device": 0,
+    "should_log_learning_rate": true,
+    "should_log_parameter_statistics": false
+  }
+}
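
One thing worth noting about this config is that the dimensions have to line up: the 25-dimensional word embedding is concatenated with the output of the bidirectional character encoder (2 × 10) to give the main encoder's `input_size` of 45, and the bidirectional main encoder (2 × 100) produces the 200-dimensional vectors expected by the bilinear attention. A small sanity-check sketch, with the numbers copied from the JSON above:

```python
# Illustration only: the arithmetic mirrors how AllenNLP composes embedders and encoders.

token_embedding_dim = 25      # "tokens" embedding_dim
char_embedding_dim = 10       # "token_characters" embedding_dim
char_lstm_hidden_size = 10    # character encoder hidden_size

# The character LSTM consumes the character embeddings directly.
assert char_embedding_dim == 10  # matches the character encoder's input_size

# A bidirectional LSTM encoder yields 2 * hidden_size features per token.
char_encoding_dim = 2 * char_lstm_hidden_size

# The source_embedder concatenates the word and character representations,
# which is why the main encoder's input_size is 45.
assert token_embedding_dim + char_encoding_dim == 45

# The main encoder is also bidirectional, so its output dimension is
# 2 * 100 = 200, matching the bilinear attention's vector_dim and matrix_dim.
assert 2 * 100 == 200
```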

experiments/nl2bash/copynet.json

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,6 @@
 {
   "dataset_reader": {
+    "target_namespace": "target_tokens",
     "type": "nl2bash",
     "source_token_indexers": {
       "tokens": {
@@ -20,6 +21,9 @@
     "min_count": {
       "source_tokens": 4,
       "target_tokens": 4
+    },
+    "tokens_to_add": {
+      "target_tokens": ["@COPY@"]
     }
   },
   "train_data_path": "data/nl2bash/train.tsv",
@@ -62,6 +66,8 @@
       "matrix_dim": 200
     },
     "target_embedding_dim": 10,
+    "beam_size": 5,
+    "max_decoding_steps": 50
   },
   "iterator": {
     "type": "bucket",

modules/data/dataset_readers/copynet.py

Lines changed: 77 additions & 30 deletions
@@ -4,20 +4,23 @@
 import numpy as np
 from overrides import overrides
 
+from allennlp.common.checks import ConfigurationError
+from allennlp.common.file_utils import cached_path
 from allennlp.common.util import START_SYMBOL, END_SYMBOL
 from allennlp.data.dataset_readers.dataset_reader import DatasetReader
-from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
 from allennlp.data.fields import TextField, ArrayField
 from allennlp.data.instance import Instance
-from allennlp.data.tokenizers import Token, Tokenizer
-from allennlp.data.token_indexers import TokenIndexer
+from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
+
+from modules.data.fields import CopyMapField
 
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 
 @DatasetReader.register("copynet")
-class CopyNetDatasetReader(Seq2SeqDatasetReader):
+class CopyNetDatasetReader(DatasetReader):
     """
     Read a tsv file containing paired sequences, and create a dataset suitable for a
     ``CopyNet`` model, or any model with a matching API.
@@ -28,6 +31,9 @@ class CopyNetDatasetReader(Seq2SeqDatasetReader):
 
     Parameters
    ----------
+    target_namespace : ``str``, required
+        The vocab namespace for the targets. This needs to be passed to the dataset reader
+        in order to construct the CopyMapField.
     source_tokenizer : ``Tokenizer``, optional
         Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults
         to ``WordTokenizer()``.
@@ -43,20 +49,32 @@ class CopyNetDatasetReader(Seq2SeqDatasetReader):
     """
 
     def __init__(self,
+                 target_namespace: str,
                  source_tokenizer: Tokenizer = None,
                  target_tokenizer: Tokenizer = None,
                  source_token_indexers: Dict[str, TokenIndexer] = None,
                  target_token_indexers: Dict[str, TokenIndexer] = None,
                  lazy: bool = False) -> None:
-        # The only reason we override __init__ is so that we can ensure `source_add_start_token`
-        # is True. This is because the CopyNet model always assumes the start token
-        # will be part of the source sentence.
-        super().__init__(source_tokenizer=source_tokenizer,
-                         target_tokenizer=target_tokenizer,
-                         source_token_indexers=source_token_indexers,
-                         target_token_indexers=target_token_indexers,
-                         source_add_start_token=True,
-                         lazy=lazy)
+        super().__init__(lazy)
+        self._target_namespace = target_namespace
+        self._source_tokenizer = source_tokenizer or WordTokenizer()
+        self._target_tokenizer = target_tokenizer or self._source_tokenizer
+        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
+        self._target_token_indexers = target_token_indexers or self._source_token_indexers
+
+    @overrides
+    def _read(self, file_path):
+        with open(cached_path(file_path), "r") as data_file:
+            logger.info("Reading instances from lines in file at: %s", file_path)
+            for line_num, line in enumerate(data_file):
+                line = line.strip("\n")
+                if not line:
+                    continue
+                line_parts = line.split('\t')
+                if len(line_parts) != 2:
+                    raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
+                source_sequence, target_sequence = line_parts
+                yield self.text_to_instance(source_sequence, target_sequence)
 
 
     def _preprocess_source(self, source_string: str) -> str:  # pylint: disable=no-self-use
@@ -72,6 +90,27 @@ def _preprocess_target(self, target_string: str) -> str:  # pylint: disable=no-s
         """
         return target_string
 
+    @staticmethod
+    def _create_copy_indicator_array(tokenized_source: List[Token],
+                                     tokenized_target: List[Token]) -> np.array:
+        copy_indicator_array: List[List[int]] = []
+        for target_token in tokenized_target[1:-1]:
+            source_index_list: List[int] = [int(target_token.text.lower() == source_token.text.lower())
+                                            for source_token in tokenized_source[1:-1]]
+            copy_indicator_array.append(source_index_list)
+        copy_indicator_array.insert(0, [0] * len(tokenized_source[1:-1]))
+        copy_indicator_array.append([0] * len(tokenized_source[1:-1]))
+        return np.array(copy_indicator_array)
+
+    @staticmethod
+    def _create_source_duplicates_array(tokenized_source: List[Token]) -> np.array:
+        out_array: List[List[int]] = []
+        for token in tokenized_source[1:-1]:
+            array_slice: List[int] = [int(token.text.lower() == other.text.lower())
+                                      for other in tokenized_source[1:-1]]
+            out_array.append(array_slice)
+        return np.array(out_array)
+
     @overrides
     def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
         # pylint: disable=arguments-differ
@@ -80,6 +119,24 @@ def text_to_instance(self, source_string: str, target_string: str = None) -> Ins
         tokenized_source.insert(0, Token(START_SYMBOL))
         tokenized_source.append(Token(END_SYMBOL))
         source_field = TextField(tokenized_source, self._source_token_indexers)
+
+        # For each token in the source sentence, we store a sparse array containing
+        # indicators for each other source token that matches. This gives us
+        # a matrix of shape `(source_length, source_length)` where the (i,j)th entry
+        # is a 1 if the ith token matches the jth token.
+        source_duplicates_array = self._create_source_duplicates_array(tokenized_source)
+        source_duplicates_field = ArrayField(source_duplicates_array)
+
+        # For each token in the source sentence, we keep track of the matching token
+        # in the target sentence (which will be the OOV symbol if there is no match).
+        target_pointer_field = CopyMapField(tokenized_source[1:-1], self._target_namespace)
+
+        fields_dict = {
+            "source_tokens": source_field,
+            "source_duplicates": source_duplicates_field,
+            "target_pointers": target_pointer_field,
+        }
+
         if target_string is not None:
             target_string = self._preprocess_target(target_string)
             tokenized_target = self._target_tokenizer.tokenize(target_string)
@@ -89,22 +146,12 @@ def text_to_instance(self, source_string: str, target_string: str = None) -> Ins
 
             # For each token in the target sentence, we keep track of the index
            # of every token in the source sentence that matches.
-            source_index_array: List[List[int]] = []
-            for tgt_tok in tokenized_target[1:-1]:
-                source_index_list: List[int] = []
-                for src_tok in tokenized_source[1:-1]:
-                    if tgt_tok.text == src_tok.text:
-                        source_index_list.append(1)
-                    else:
-                        source_index_list.append(0)
-                source_index_array.append(source_index_list)
-            source_index_array.insert(0, [0] * len(tokenized_source[1:-1]))
-            source_index_array.append([0] * len(tokenized_source[1:-1]))
-            source_index_field = ArrayField(np.array(source_index_array))
+            copy_indicator_array = self._create_copy_indicator_array(tokenized_source,
                                                                      tokenized_target)
             # shape: (target_length, source_length)
+            copy_indicator_field = ArrayField(copy_indicator_array)
+
+            fields_dict["target_tokens"] = target_field
+            fields_dict["copy_indicators"] = copy_indicator_field
 
-            return Instance({"source_tokens": source_field,
-                             "target_tokens": target_field,
-                             "source_indices": source_index_field})
-        else:
-            return Instance({'source_tokens': source_field})
+        return Instance(fields_dict)
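
To make the two new static helpers concrete, here is a small worked example in the style of the Greetings data, re-implemented with plain strings instead of AllenNLP `Token` objects; it illustrates the logic only and is not the module code itself.

```python
import numpy as np


def copy_indicator_array(source, target):
    # One row per target token (the first and last rows correspond to the
    # START/END placeholders and are all zeros), one column per source token
    # excluding START/END; entry (i, j) is 1 when the tokens match, ignoring case.
    inner_source = source[1:-1]
    rows = [[int(t.lower() == s.lower()) for s in inner_source] for t in target[1:-1]]
    return np.array([[0] * len(inner_source)] + rows + [[0] * len(inner_source)])


def source_duplicates_array(source):
    # Entry (i, j) is 1 when source tokens i and j (excluding START/END) are the same word.
    inner_source = source[1:-1]
    return np.array([[int(a.lower() == b.lower()) for b in inner_source]
                     for a in inner_source])


source = ["@start@", "hi", "my", "name", "is", "jon", "snow", "@end@"]
target = ["@start@", "nice", "to", "meet", "you", "jon", "snow", "!", "@end@"]

print(copy_indicator_array(source, target).shape)  # (9, 6): 9 target rows, 6 source columns
print(source_duplicates_array(source))             # 6x6; the identity here, since no word repeats
```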

modules/data/dataset_readers/nl2bash.py

Lines changed: 3 additions & 1 deletion
@@ -95,14 +95,16 @@ class NL2BashDatasetReader(CopyNetDatasetReader):
     prompt_finder = re.compile(r"^(\$|#)\s?")
 
     def __init__(self,
+                 target_namespace: str,
                  source_tokenizer: Tokenizer = None,
                  target_tokenizer: Tokenizer = None,
                  source_token_indexers: Dict[str, TokenIndexer] = None,
                  target_token_indexers: Dict[str, TokenIndexer] = None,
                  lazy: bool = False) -> None:
         source_tokenizer = source_tokenizer or WordTokenizer(word_splitter=NL2BashWordSplitter())
         target_tokenizer = target_tokenizer or source_tokenizer
-        super().__init__(source_tokenizer=source_tokenizer,
+        super().__init__(target_namespace,
+                         source_tokenizer=source_tokenizer,
                          target_tokenizer=target_tokenizer,
                          source_token_indexers=source_token_indexers,
                          target_token_indexers=target_token_indexers,

modules/data/fields/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from modules.data.fields.copy_map_field import CopyMapField

modules/data/fields/copy_map_field.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+from typing import Dict, List, Optional
+
+from overrides import overrides
+import torch
+
+from allennlp.common.util import pad_sequence_to_length
+from allennlp.data import Vocabulary
+from allennlp.data.tokenizers import Token
+from allennlp.data import Field
+
+
+class CopyMapField(Field[torch.Tensor]):
+
+    def __init__(self,
+                 source_tokens: List[Token],
+                 target_namespace: str) -> None:
+        self._source_tokens = source_tokens
+        self._target_namespace = target_namespace
+        self._mapping_array: Optional[List[int]] = None
+
+    @overrides
+    def index(self, vocab: Vocabulary):
+        self._mapping_array = [vocab.get_token_index(x.text, self._target_namespace)
+                               for x in self._source_tokens]
+
+    @overrides
+    def get_padding_lengths(self) -> Dict[str, int]:
+        return {"num_tokens": len(self._source_tokens)}
+
+    @overrides
+    def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
+        desired_length = padding_lengths["num_tokens"]
+        padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
+        tensor = torch.LongTensor(padded_tokens)
+        return tensor
+
+    @overrides
+    def empty_field(self) -> 'CopyMapField':
+        return CopyMapField([], self._target_namespace)
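
For context, a rough usage sketch of the new field. It assumes the AllenNLP 0.x `Vocabulary` API (`add_token_to_namespace`, `get_token_index`) already used elsewhere in this repo, and the vocabulary contents are made up purely for illustration.

```python
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

from modules.data.fields import CopyMapField

# Build a small target-side vocabulary.
vocab = Vocabulary()
for word in ["nice", "to", "meet", "you", "jon", "snow"]:
    vocab.add_token_to_namespace(word, namespace="target_tokens")

# Map each source token to its id in the *target* namespace; source words that
# are not in the target vocabulary resolve to the OOV id.
field = CopyMapField([Token(w) for w in ["hi", "my", "name", "is", "jon", "snow"]],
                     target_namespace="target_tokens")
field.index(vocab)
tensor = field.as_tensor(field.get_padding_lengths())
print(tensor)  # a LongTensor with one target-vocabulary id per source token
```

This mapping is what lets the decoder translate "copy the token at source position j" into a concrete id in the target vocabulary.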
