From 897eb1e4541a8264389db0f3ec219fe849a37d1e Mon Sep 17 00:00:00 2001 From: Shyam Upadhyay Date: Sat, 26 Jan 2019 16:39:58 -0500 Subject: [PATCH] code upload --- .gitignore | 16 + README.md | 71 +- baseline/Makefile | 10 + baseline/align.c | 717 ++++++++++++++++++ baseline/align.py | 96 +++ baseline/align_utils.py | 10 + baseline/baseline.py | 367 +++++++++ baseline/perceptron.c | 397 ++++++++++ baseline/perceptron_c.py | 114 +++ load_and_test_model_interactive.sh | 25 + load_and_test_model_on_files.sh | 32 + readers/aligned_reader.py | 121 +++ seq2seq/constants.py | 11 + seq2seq/encoder.py | 61 ++ seq2seq/evaluators/reporter.py | 185 +++++ seq2seq/inferences/evaluate.py | 27 + seq2seq/inferences/monotonic_infer.py | 172 +++++ seq2seq/lang.py | 29 + seq2seq/main.py | 151 ++++ seq2seq/model_utils.py | 108 +++ seq2seq/monotonic_decoder.py | 42 + seq2seq/runner.py | 52 ++ seq2seq/torch_utils.py | 33 + seq2seq/trainers/monotonic_train.py | 116 +++ seq2seq/trainers/seq2seq_attn_trainer.py | 195 +++++ train_model.sh | 28 + train_model_on_files.sh | 30 + utils/__init__.py | 0 utils/arguments.py | 44 ++ .../news_evaluation_script/news_evaluation.py | 510 +++++++++++++ utils/news_evaluation_script/news_results.xml | 108 +++ utils/news_evaluation_script/news_test.xml | 36 + 32 files changed, 3906 insertions(+), 8 deletions(-) create mode 100644 .gitignore create mode 100644 baseline/Makefile create mode 100644 baseline/align.c create mode 100644 baseline/align.py create mode 100644 baseline/align_utils.py create mode 100755 baseline/baseline.py create mode 100644 baseline/perceptron.c create mode 100644 baseline/perceptron_c.py create mode 100755 load_and_test_model_interactive.sh create mode 100755 load_and_test_model_on_files.sh create mode 100644 readers/aligned_reader.py create mode 100644 seq2seq/constants.py create mode 100644 seq2seq/encoder.py create mode 100644 seq2seq/evaluators/reporter.py create mode 100644 seq2seq/inferences/evaluate.py create mode 100644 seq2seq/inferences/monotonic_infer.py create mode 100644 seq2seq/lang.py create mode 100644 seq2seq/main.py create mode 100644 seq2seq/model_utils.py create mode 100644 seq2seq/monotonic_decoder.py create mode 100644 seq2seq/runner.py create mode 100644 seq2seq/torch_utils.py create mode 100644 seq2seq/trainers/monotonic_train.py create mode 100644 seq2seq/trainers/seq2seq_attn_trainer.py create mode 100755 train_model.sh create mode 100755 train_model_on_files.sh create mode 100644 utils/__init__.py create mode 100644 utils/arguments.py create mode 100755 utils/news_evaluation_script/news_evaluation.py create mode 100644 utils/news_evaluation_script/news_results.xml create mode 100644 utils/news_evaluation_script/news_test.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a65a715 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.tab +*.dict +*.pred +*.model +*.tar +*.vocab +*.vocab.romanized +*.tar.gz +phone_index* +data/ +.idea +*.txt +*.pyc +*.log +*.so +m2m/ \ No newline at end of file diff --git a/README.md b/README.md index b844e87..73de0d8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,67 @@ -Code for the EMNLP paper, "Bootstrapping Transliteration with Guided Discovery for Low-Resource Languages". -Coming soon. +## Running the code +1. First compile the C code for the aligner. 
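+
+This produces `libalign.so` and `libperceptron.so` in `baseline/`; the Python
+wrappers load them through `ctypes`. Once `make` (run as shown below) has
+finished, a quick load check (a minimal sketch, assuming it is executed from
+the repository root) is:
+
+```python
+from ctypes import cdll
+
+# Both calls should return a library handle without raising OSError.
+cdll.LoadLibrary('./baseline/libalign.so')
+cdll.LoadLibrary('./baseline/libperceptron.so')
+```
+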
+```bash +cd baseline/ +make ``` -@InProceedings{UKR18, - author = {Upadhyay, Shyam and Kodner, Jordan and Roth, Dan}, - title = {Bootstrapping Transliteration with Guided Discovery for Low-Resource Languages}, - booktitle = {EMNLP}, - year = {2018}, -} + +2. write you train, dev and test data in the following format, + +``` +x1 x2 x3y1 y2 y3 y4 y5 +``` +where `x1x2x3` is the input word (`xi` is the character), and `y1y2y3y4y5` is the desired output (transliteration). Example train and test files for bengali are in data/ folder. There is a optional 3rd column marking whether the word is *native* or *foreign* (see the paper for these terms); this column can be ignored for most purposes. + + +3. Run `train_model_on_files.sh` on your train (say train.txt) and dev file (dev.txt) as follows, + +``` +./train_model_on_files.sh train.txt dev.txt 100 translit.model +``` + +where 100 is the random seed and translit.model is the output model. Other parameters(see `utils/arguments.py` for options) can be specified by modifying the `train_model_on_files.sh` script appropriately. + +4. Test the trained model as follows, + +``` +./load_and_test_model_on_files.sh train.txt test.txt translit.model 100 output.txt +``` + +The output should report relevant metrics, + +``` +... +... +:INFO: --------------------TEST-------------------- +:INFO: running infer on example 200 +:INFO: running infer on example 400 +:INFO: running infer on example 600 +:INFO: running infer on example 800 +:INFO: accuracy 367/997=0.37 +:INFO: accuracy (nat) 308/661=0.47 +:INFO: accuracy (eng) 59/336=0.18 +:INFO: ********************total******************** +:INFO: ACC: 0.371457 (367/988) +:INFO: Mean F-score: 0.910995 +:INFO: Mean ED@1: 1.136640+-1.167 +:INFO: Mean NED@1: 0.084884 +:INFO: Median ED@1: 1.000000 +... +... ``` + +There is also a interactive mode where one can input test words directly, + +``` +./load_and_test_model_interactive.sh +... +... +:INFO: => loading checkpoint hindi.model +:INFO: => loaded checkpoint! +enter surface:ओबामा +ओ ब ा म ा +[(-0.4624647759074629, 'o b a m a')] +``` + diff --git a/baseline/Makefile b/baseline/Makefile new file mode 100644 index 0000000..50bbda8 --- /dev/null +++ b/baseline/Makefile @@ -0,0 +1,10 @@ +all: libperceptron.so libalign.so + +libperceptron.so: perceptron.c + gcc -O3 -Wall -Wextra -shared -fPIC perceptron.c -o libperceptron.so + +libalign.so: align.c + gcc -O3 -Wall -Wextra -shared -fPIC align.c -o libalign.so + +clean: + /bin/rm libperceptron.so libalign.so *.pyc diff --git a/baseline/align.c b/baseline/align.c new file mode 100644 index 0000000..632be85 --- /dev/null +++ b/baseline/align.c @@ -0,0 +1,717 @@ +/************************************************************************/ +/* crpalign - Chinese Restaurant Process string pair aligner */ +/* Copyright © 2013 Mans Hulden */ +/* */ +/* This file is part of crpalign. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/************************************************************************/ + +/* To build python bindings: gcc -O3 -Wall -Wextra -shared align.c -o libalign.so */ +/* WARNING: currently not thread-safe */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Yields minimum of three values */ +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? 
(b) : (c))) +/* Compares three values, yielding -1, 0, 1 depending on if a, b, or c is the smallest */ +#define CMP3(a, b, c) ((a) < (b) ? ((a) < (c) ? (-1) : (1)) : ((b) < (c) ? (0) : (1))) + +#define LEFT -1 +#define DIAG 0 +#define DOWN 1 + +#define INPUT_FORMAT_L2P 0 +#define INPUT_FORMAT_NEWS 1 + +#define OUTPUT_FORMAT_PLAIN 0 +#define OUTPUT_FORMAT_ALIGNED 1 +#define OUTPUT_FORMAT_PHONETISAURUS 2 +#define OUTPUT_FORMAT_M2M 3 + +#define MATRIX_MODE_MED 0 +#define MATRIX_MODE_GS 1 + +int g_maxsymbol = 0; +int g_debug = 0; +int g_med = 0; +int g_in_result[256]; +int g_out_result[256]; +int g_paircount = 0; +int g_distinct_pairs = 0; +int g_input_format = INPUT_FORMAT_L2P; +int g_output_format = OUTPUT_FORMAT_ALIGNED; +double g_trellis[256][256]; +int g_backptr[256][256]; +int g_current_count[256][256]; +int g_global_count[256][256]; +char *g_symboltable[1024]; +double g_prior = 0.1; +double g_zero = 0.0; + +struct stringpair { /* These are all */ + int *in; /* -1 terminated int sequences */ + int *out; + int *inaligned; + int *outaligned; + struct stringpair *next; +} *g_stringpairs = NULL, *g_stringpairs_tail = NULL; + +void align_init(void) { + g_stringpairs = NULL; + g_stringpairs_tail = NULL; +} + +void align_init_with_seed(long seed) { + g_stringpairs = NULL; + g_stringpairs_tail = NULL; + srand48(seed); +} + +int intseqlen(int *seq) { + int i; + for (i = 0; seq[i] != -1; i++) { } + return i; +} + +double log_add(double logy, double logx) { + /* Supposes that inputs are negative log probabilities */ + if (logy > logx) { + double temp = logx; + logx = logy; + logy = temp; + } + double negdiff = logx - logy; + if (negdiff > 80) { + return(logy); + } + return logx - log(1 + exp(logx - logy)); +} + +void debug(const char *fmt, ...) { + va_list arg; + if (g_debug == 1) { + va_start(arg, fmt); + vprintf(fmt, arg); + va_end(arg); + } +} + +/* Gives length in bytes of UTF-8 character */ +int utf8len(char *str) { + unsigned char s; + s = (unsigned char)(unsigned int) (*str); + if (s < 0x80) + return 1; + if ((s & 0xe0) == 0xc0) { + return 2; + } + if ((s & 0xf0) == 0xe0) { + return 3; + } + if ((s & 0xf8) == 0xf0) { + return 4; + } + return 0; +} + +/* Reverses an integer sequence in-place */ +void vector_reverse(int *s, int length) { + int c, i, j; + for (i = 0, j = length - 1; i < j; i++, j--) { + c = s[i]; + s[i] = s[j]; + s[j] = c; + } +} + +/* Returns number of UTF8 characters in char array */ +int utf8strlen(char *str) { + int i,j, len; + len = strlen(str); + for (i = 0, j = 0; *(str+i) != '\0' && i < len; j++) { + i = i + utf8len(str+i); + } + return j; +} + +int random_3draw(double a, double b, double c) { + + /* From three negative logprobs, do a weighted coin toss */ + /* proportional to each probability, returing -1, 0, 1 */ + /* depending on if a, b, or c is drawn. */ + + double minv, subv, rand; + /* Scale neg logprobs */ + minv = MIN3(a, b, c); + if (minv >= 2) { + subv = minv - 2; /* <= we scale so that highest prob entry is 2 (in -log space) */ + a -= subv; /* This to avoid underflow when converting to reals */ + b -= subv; /* for the weighted random choice. */ + c -= subv; + } + a = exp(-a); /* Convert to three probabilities */ + b = exp(-b); + c = exp(-c); + rand = drand48(); + rand = rand * (a + b + c); + if (rand < a) { return -1; } + if (rand < a+b) { return 0; } + return 1; +} + +/* Fills trellis with aligned integer sequences in and out, using the callback function */ +/* cost(). 
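Each cell of the trellis combines three moves scored by cost(): inserting an output symbol (cost(0, out)), deleting an input symbol (cost(in, 0)), or pairing the two (cost(in, out)).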
Returns aligned strings in g_in_result[] and g_out_result[] */ +/* If mode = MODE_GS, we resample alignments by a CRP process (filling trellis "forward" */ +/* and then drawing a new alignment going "backward") */ +/* If mode = MODE_MED, we find the "cheapest" alignment */ + +double fill_trellis(int *in, int *out, double(*cost)(int, int), int mode) { + int i, x, y, inlen, outlen; + double left, down, diag, p; + inlen = intseqlen(in); + outlen = intseqlen(out); + g_trellis[0][0] = g_zero; + for (x = 1; x <= outlen; x++) { + g_trellis[x][0] = g_trellis[x-1][0] + cost(0,out[x-1]); + g_backptr[x][0] = LEFT; + } + for (y = 1; y <= inlen; y++) { + g_trellis[0][y] = g_trellis[0][y-1] + cost(in[y-1], 0); + g_backptr[0][y] = DOWN; + } + for (x = 1; x <= outlen; x++) { + for (y = 1; y <= inlen; y++) { + left = g_trellis[x-1][y] + cost(0,out[x-1]); + down = g_trellis[x][y-1] + cost(in[y-1], 0); + diag = g_trellis[x-1][y-1] + cost(in[y-1], out[x-1]); + + if (mode == MATRIX_MODE_MED) { + g_trellis[x][y] = MIN3(left, diag, down); + g_backptr[x][y] = CMP3(left, diag, down); + } + else if (mode == MATRIX_MODE_GS) { + g_trellis[x][y] = log_add(log_add(left, diag), down); + } + } + } + + /* Resample a new "path" for the string pair starting from upper right-hand corner + in the matrix and moving left, down, or diagonally down/left until we reach [0,0] + ..[B][A] To choose the direction we do a weighted coin toss between choices A -> B, A -> C, A -> D: + ..[C][D] w(B) = p(B) * p(B->A) ; w(C) = p(C) * p(C->A) ; w(D) = p(D) * p(D -> A). + . . and p(X->Y) = the probability of taking the transition (X->Y) + . . Since we've stored the probabilities in log space, we need to do some scaling + and conversion before doing the weighted toss. + */ + + if (mode == MATRIX_MODE_GS) { + for (y = inlen, x = outlen; x > 0 || y > 0 ; ) { + if (x == 0) { + y--; + } else if (y == 0) { + x--; + } else { + left = g_trellis[x-1][y] + cost(0,out[x-1]); + down = g_trellis[x][y-1] + cost(in[y-1], 0); + diag = g_trellis[x-1][y-1] + cost(in[y-1], out[x-1]); + g_backptr[x][y] = random_3draw(left, diag, down); + x--; + y--; + } + } + } + + for (i = 0, y = inlen, x = outlen; x > 0 || y > 0; i++) { + if (g_backptr[x][y] == DIAG) { + x--; + y--; + g_in_result[i] = in[y]; + g_out_result[i] = out[x]; + } else if (g_backptr[x][y] == LEFT) { + x--; + g_in_result[i] = 0; + g_out_result[i] = out[x]; + } else if (g_backptr[x][y] == DOWN) { + y--; + g_in_result[i] = in[y]; + g_out_result[i] = 0; + } + } + + g_in_result[i] = -1; + g_out_result[i] = -1; + + vector_reverse(g_in_result, i); + vector_reverse(g_out_result, i); + p = g_trellis[outlen][inlen]; + return(p); +} + +/* Removes the counts of symbol pairs in two -1 -terminated sequences */ +/* to the current count table */ +void remove_counts(int *in, int *out) { + int i; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + g_current_count[in[i]][out[i]]--; + if (g_current_count[in[i]][out[i]] == 0) { + g_distinct_pairs--; + } + } +} + +/* Add the counts of symbol pairs in two -1 -terminated sequences */ +/* to the current count table */ +void add_counts(int *in, int *out) { + int i; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + g_current_count[in[i]][out[i]]++; + g_paircount++; + if (g_current_count[in[i]][out[i]] == 1) { + g_distinct_pairs++; + } + } +} + +/* Add running counts of pairs to the global count table */ +void add_global_counts() { + int i, j; + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + g_global_count[i][j] += g_current_count[i][j]; + } + } 
+} + +void print_counts() { + int i, j; + debug("\n"); + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + debug("%i ", g_current_count[i][j]); + } + debug("\n"); + } +} + +/* Cost function called by fill_trellis for MED */ +double cost_levenshtein(int a, int b) { + if (a != b) { + return 1.0; + } + return 0.0; +} + +/* Cost function called by fill_trellis for CRP alignment */ +double cost_crp(int in, int out) { + double cost; + cost = (double)( g_current_count[in][out] + g_prior ) / (double)( g_paircount + g_distinct_pairs * g_prior ); + return(-log(cost)); +} + +/* Initially, align all string pairs greedily, i.e. e.g. => */ +void initial_align() { + struct stringpair *pair; + int inlen, outlen, i, j, k; + for (pair = g_stringpairs; pair != NULL; pair = pair->next) { + inlen = intseqlen(pair->in); + outlen = intseqlen(pair->out); + pair->inaligned = malloc(sizeof(int) * (inlen+outlen+1)); + pair->outaligned = malloc(sizeof(int) * (inlen+outlen+1)); + + for (i = 0, j = 0, k = 0; pair->in[i] != -1 || pair->out[j] != -1; k++) { + if (pair->in[i] == -1) { + pair->inaligned[k] = 0; + pair->outaligned[k] = pair->out[j]; + j++; + } + else if (pair->out[j] == -1) { + pair->inaligned[k] = pair->in[i]; + pair->outaligned[k] = 0; + i++; + } else { + pair->inaligned[k] = pair->in[i]; + pair->outaligned[k] = pair->out[j]; + i++; + j++; + } + } + pair->inaligned[k] = -1; + pair->outaligned[k] = -1; + add_counts(pair->inaligned, pair->outaligned); + } +} + +/* Align a set of string pairs by minimum edit distance (for reference) */ +void med_align() { + struct stringpair *sp; + int j; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + fill_trellis(sp->in, sp->out, &cost_levenshtein, MATRIX_MODE_MED); /* Fill trellis */ + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + } +} + +void crp_align() { + struct stringpair *sp; + int j; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + fill_trellis(sp->in, sp->out, &cost_crp, MATRIX_MODE_MED); + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + } +} + +void crp_train(int iterations, int burnin, int lag) { + struct stringpair *sp; + int i, j; + for (i = 0; i < iterations; i++) { + fprintf(stderr,"Alignment iteration: %i\n", i); + print_counts(); + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + remove_counts(sp->inaligned, sp->outaligned); /* Remove counts before aligning */ + fill_trellis(sp->in, sp->out, &cost_crp, MATRIX_MODE_GS); + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + add_counts(sp->inaligned, sp->outaligned); /* Add counts back from new alignment */ + } + if (i > burnin && i % lag == 0) { + add_global_counts(); + } + } +} + +int get_set_char_num(char *utfstring) { + int i; + debug("Finding symbol %s with len %i... 
", utfstring, utf8len(utfstring)); + for (i = 1; i <= g_maxsymbol; i++) { + if (strcmp(utfstring, g_symboltable[i]) == 0) { + debug("Found at %i\n", i); + return i; + } + } + g_maxsymbol++; + debug("Not found, adding at %i\n", g_maxsymbol); + g_symboltable[g_maxsymbol] = strdup(utfstring); + return(g_maxsymbol); +} + +/* Reads character sequences in and out and onverts them to integer sequences */ +/* And adds them to the global list of integer sequence pairs */ + +void add_string_pair(char *in, char *out) { + int *int_in, *int_out; + int i, j; + char *token; + struct stringpair *newpair; + /* Get int array */ + int_in = malloc(sizeof(int) * (utf8strlen(in) + 1)); + int_out = malloc(sizeof(int) * (utf8strlen(out) + 1)); + if (g_input_format == INPUT_FORMAT_L2P) { + for (i = 0, j = 0; in[i] != '\0'; i += utf8len(&in[i]), j++) { + int_in[j] = get_set_char_num(strndup(&in[i], utf8len(&in[i]))); + } + int_in[j] = -1; + for (i = 0, j = 0; out[i] != '\0'; i += utf8len(&out[i]), j++) { + int_out[j] = get_set_char_num(strndup(&out[i], utf8len(&out[i]))); + } + int_out[j] = -1; + } else if (g_input_format == INPUT_FORMAT_NEWS) { + token = strtok(in, " "); + for (j = 0; token != NULL; j++) { + int_in[j] = get_set_char_num(token); + token = strtok(NULL, " "); + } + int_in[j] = -1; + token = strtok(out, " "); + for (j = 0; token != NULL; j++) { + int_out[j] = get_set_char_num(token); + token = strtok(NULL, " "); + } + int_out[j] = -1; + } + + newpair = malloc(sizeof(struct stringpair)); + newpair->in = int_in; + newpair->out = int_out; + newpair->next = NULL; + if (g_stringpairs == NULL) { + g_stringpairs = newpair; + g_stringpairs_tail = newpair; + } else { + g_stringpairs_tail->next = newpair; + g_stringpairs_tail = newpair; + } +} + +/* Directly add two -1 terminated integer sequences */ +void add_int_pair(int *in, int *out) { + int inlen, outlen; + struct stringpair *newpair; + newpair = malloc(sizeof(struct stringpair)); + inlen = intseqlen(in) + 1; + outlen = intseqlen(out) + 1; + newpair->in = malloc(inlen * sizeof(int)); + newpair->out = malloc(outlen * sizeof(int)); + memcpy(newpair->in, in, inlen * sizeof(int)); + memcpy(newpair->out, out, outlen * sizeof(int)); + newpair->next = NULL; + if (g_stringpairs == NULL) { + g_stringpairs = newpair; + g_stringpairs_tail = newpair; + } else { + g_stringpairs_tail->next = newpair; + g_stringpairs_tail = newpair; + } +} + +void clear_counts() { + int i,j; + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + g_current_count[i][j] = 0; + g_global_count[i][j] = 0; + } + } +} + +void print_pair_plain(int *in, int *out) { + int i; + g_symboltable[0] = " "; + for (i = 0; in[i] != -1; i++) { + printf("%s", in[i] == 0 ? " " : g_symboltable[ in[i] ]); + } + printf("\n"); + for (i = 0; out[i] != -1; i++) { + printf("%s", out[i] == 0 ? " " : g_symboltable[ out[i] ]); + } + printf("\n\n"); +} + +void print_pair_m2m(int *in, int *out) { + int i; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1; i++) { + printf("%s|", in[i] == 0 ? " " : g_symboltable[ in[i] ]); + } + printf("\t"); + for (i = 0; out[i] != -1; i++) { + printf("%s|", out[i] == 0 ? 
" " : g_symboltable[ out[i] ]); + } + printf("\n"); +} + +void print_pair_phonetisaurus(int *in, int *out) { + int i; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + printf("%s}%s", g_symboltable[in[i]], g_symboltable[out[i]]); + if (in[i+1] != -1 && out[i+1] != -1) { + printf(" "); + } + } + printf("\n"); +} + +void print_pair_aligned(int *in, int *out) { + int i, fieldwidth; + char *instr, *outstr; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + instr = g_symboltable[ in[i] ]; + outstr = g_symboltable[ out[i] ]; + fieldwidth = utf8strlen(instr) > utf8strlen(outstr) ? utf8strlen(instr) : utf8strlen(outstr); + printf("%-*s", fieldwidth, instr); + if (in[i+1] != -1 && out[i+1] != -1) + printf("|"); + } + printf("\n"); + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + instr = g_symboltable[ in[i] ]; + outstr = g_symboltable[ out[i] ]; + fieldwidth = utf8strlen(instr) > utf8strlen(outstr) ? utf8strlen(instr) : utf8strlen(outstr); + printf("%-*s", fieldwidth, outstr); + if (in[i+1] != -1 && out[i+1] != -1) + printf("|"); + } + printf("\n\n"); +} + +/* Functions for Python ctypes wrap */ + +struct stringpair *getpairs_init() { + return g_stringpairs; +} + +int *getpairs_in(struct stringpair *sp) { + return sp->inaligned; +} + +int *getpairs_out(struct stringpair *sp) { + return sp->outaligned; +} + +struct stringpair *getpairs_advance(struct stringpair *sp) { + return sp->next; +} + +/************************************/ + +void write_stringpairs() { + struct stringpair *sp; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + switch(g_output_format) { + case OUTPUT_FORMAT_PLAIN: + print_pair_plain(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_ALIGNED: + print_pair_aligned(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_PHONETISAURUS: + print_pair_phonetisaurus(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_M2M: + print_pair_m2m(sp->inaligned, sp->outaligned); + break; + } + } +} + +void read_stringpairs() { + char *my_string = NULL, *token1, *token2; + char str1[1024], str2[1024]; + size_t nbytes; + int bytes_read; + while ((bytes_read = getline(&my_string, &nbytes, stdin)) != -1) { + if (g_input_format == INPUT_FORMAT_L2P) { + if (sscanf(my_string, "%1023s %1023s", &str1[0], &str2[0]) == 2) + add_string_pair(str1, str2); + } else if (g_input_format == INPUT_FORMAT_NEWS) { + token1 = strtok(my_string, "\t\n"); + token2 = strtok(NULL, "\t\n"); + if (token1 != NULL && token2 != NULL) + add_string_pair(token1, token2); + } + } + clear_counts(); + initial_align(); +} + +int main(int argc, char **argv) { + static char *usagestring = + "Chinese restaurant process string pair aligner\n" + "Basic usage: crpalign11 [options] < infile.txt > aligned.txt\n" + " infile.txt is a list of TAB-separated word-pairs, one pair per line.\n\n" + "Options:\n" + "-d --debug print debug info\n" + "-h --help help\n" + "-m --med do simple med-alignment only (for comparison)\n" + "-x NUM --iterations=NUM run aligner for NUM iterations (default 10)\n" + "-i FMT --informat=FMT expect data in format FMT=l2p|news (default l2p)\n" + "-o FMT --outformat=FMT print data in format FMT=plain|aligned|phonetisaurus|m2m\n" + "-b NUM --burnin=NUM run Gibbs sampler with NUM iterations of burn-in\n" + "-l NUM --lag=NUM collect counts from sampler every NUM iterations\n" + "-p NUM --prior=NUM use a prior of NUM for sampler (default 0.1)\n"; + + + int opt, iterations = 10, burnin = 5, lag = 1, option_index = 0; + static struct 
option long_options[] = + { + {"debug", no_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"med", no_argument, 0, 'm'}, + {"iterations", required_argument, 0, 'x'}, + {"informat", required_argument, 0, 'i'}, + {"outformat", required_argument, 0, 'o'}, + {"burnin", required_argument, 0, 'b'}, + {"lag", required_argument, 0, 'l'}, + {"prior", required_argument, 0, 'p'}, + {0, 0, 0, 0} + }; + + while ((opt = getopt_long(argc, argv, "dmx:b:l:p:i:o:h", long_options, &option_index)) != -1) { + switch(opt) { + case 'd': + g_debug = 1; + break; + case 'm': + g_med = 1; + break; + case 'x': + iterations = atoi(optarg); + break; + case 'b': + burnin = atoi(optarg); + break; + case 'h': + printf("%s", usagestring); + exit(0); + case 'i': + if (strcmp(optarg,"l2p") == 0) { + g_input_format = INPUT_FORMAT_L2P; + } else if (strcmp(optarg, "news") == 0) { + g_input_format = INPUT_FORMAT_NEWS; + } else { + fprintf(stderr, "Invalid option %s for input format\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'o': + if (strcmp(optarg,"plain") == 0) { + g_output_format = OUTPUT_FORMAT_PLAIN; + } else if (strcmp(optarg, "aligned") == 0) { + g_output_format = OUTPUT_FORMAT_ALIGNED; + } else if (strcmp(optarg, "phonetisaurus") == 0) { + g_output_format = OUTPUT_FORMAT_PHONETISAURUS; + } else if (strcmp(optarg, "m2m") == 0) { + g_output_format = OUTPUT_FORMAT_M2M; + } else { + fprintf(stderr, "Invalid option %s for output format\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'l': + lag = atoi(optarg); + break; + case 'p': + g_prior = strtod(optarg,NULL); + break; + } + } + + srand48((unsigned int)time((time_t *)NULL)); + read_stringpairs(); + if (g_med == 1) { + med_align(); + } else { + crp_train(iterations,burnin,lag); + crp_align(); + } + write_stringpairs(); + return(0); +} diff --git a/baseline/align.py b/baseline/align.py new file mode 100644 index 0000000..0ce267c --- /dev/null +++ b/baseline/align.py @@ -0,0 +1,96 @@ +# Simple class for learning an alignment of strings, MED-style. +# Weights are learned by a Chinese Restaurant Process sampler +# that weights single alignments x:y in proportion to how many times +# such an alignment has been seen elsewhere out of all possible alignments. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 + +# Usage: +# Align(wordpairs) <= wordpairs is an iterable of 2-tuples +# The resulting Align.alignedpairs is a list of aligned 2-tuples + +# Relies on C-code in libalign.so built from align.c through ctypes. 
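+#
+# Illustrative usage (a minimal sketch; assumes libalign.so has been built with
+# `make` in baseline/ and that the interpreter is started from the repository
+# root, since the library is loaded from './baseline/libalign.so'):
+#
+#   from baseline.align import Aligner
+#   # Toy word pairs; any iterable of (input, output) string 2-tuples works.
+#   pairs = [(u'kitab', u'kitaab'), (u'obama', u'obaamaa')]
+#   a = Aligner(pairs, align_symbol=u' ', iterations=10, burnin=5, lag=1,
+#               mode='crp', random_seed=42)
+#   for src, tgt in a.alignedpairs:
+#       # Each aligned pair has equal length, padded with align_symbol.
+#       print(src + u' | ' + tgt)
+#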
+# Author: Mans Hulden +# MH20151102 + +import itertools +from ctypes import * + +libalign = cdll.LoadLibrary('./baseline/libalign.so') + +libalign_add_int_pair = libalign.add_int_pair +libalign_clear_counts = libalign.clear_counts +libalign_initial_align = libalign.initial_align +libalign_crp_train = libalign.crp_train +libalign_crp_align = libalign.crp_align +libalign_med_align = libalign.med_align + +libalign_getpairs_init = libalign.getpairs_init +libalign_getpairs_init.restype = c_void_p +libalign_getpairs_in = libalign.getpairs_in +libalign_getpairs_in.restype = POINTER(c_int) +libalign_getpairs_out = libalign.getpairs_out +libalign_getpairs_out.restype = POINTER(c_int) +libalign_getpairs_advance = libalign.getpairs_advance +libalign_getpairs_advance.restype = c_void_p +libalign_align_init = libalign.align_init +libalign_align_init.restype = None +libalign_align_init_with_seed = libalign.align_init_with_seed +libalign_align_init.restype = None + +class Aligner: + + def __init__(self, wordpairs, align_symbol = u' ', iterations = 10, burnin = 5, lag = 1, mode = 'crp', random_seed = None): + s = set(u''.join((x[0] + x[1] for x in wordpairs))) + self.symboltoint = dict(zip(s, range(1,len(s)+1))) + self.inttosymbol = {v:k for k, v in self.symboltoint.items()} + self.inttosymbol[0] = align_symbol + ## Map stringpairs to -1 terminated integer sequences ## + intpairs = [] + for i, o in wordpairs: + intin = list(map(lambda x: self.symboltoint[x], i)) + [-1] + intout = list(map(lambda x: self.symboltoint[x], o)) + [-1] + intpairs.append((intin, intout)) + + if random_seed: + libalign_align_init_with_seed(random_seed) + else: + libalign_align_init() + + for i, o in intpairs: + icint = (c_int * len(i))(*i) + ocint = (c_int * len(o))(*o) + libalign_add_int_pair(icint, ocint) + + # Run CRP align + if mode == 'crp': + libalign_clear_counts() + libalign_initial_align() + libalign_crp_train(c_int(iterations), c_int(burnin), c_int(lag)) + libalign_crp_align() + else: + libalign_clear_counts() + libalign_initial_align() + libalign_med_align() + + # Reconvert to output + self.alignedpairs = [] + stringpairptr = libalign_getpairs_init() + while stringpairptr != None: + inints = libalign_getpairs_in(c_void_p(stringpairptr)) + outints = libalign_getpairs_out(c_void_p(stringpairptr)) + instr = [] + outstr = [] + for j in itertools.count(): + if inints[j] == -1: + break + instr.append(self.inttosymbol[inints[j]]) + for j in itertools.count(): + if outints[j] == -1: + break + outstr.append(self.inttosymbol[outints[j]]) + self.alignedpairs.append((''.join(instr), ''.join(outstr))) + stringpairptr = libalign_getpairs_advance(c_void_p(stringpairptr)) diff --git a/baseline/align_utils.py b/baseline/align_utils.py new file mode 100644 index 0000000..5231509 --- /dev/null +++ b/baseline/align_utils.py @@ -0,0 +1,10 @@ +import baseline.align as align + +def mcmc_align(wordpairs, align_symbol,seed): + a = align.Aligner(wordpairs, align_symbol=align_symbol,random_seed=seed) + return a.alignedpairs + + +def med_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol=align_symbol, mode='med') + return a.alignedpairs diff --git a/baseline/baseline.py b/baseline/baseline.py new file mode 100755 index 0000000..ae4c5c3 --- /dev/null +++ b/baseline/baseline.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python +""" +Baseline system for the SIGMORPHON 2016 Shared Task. + +Solves tasks 1,2, and 3, evaluating on dev data and outputs guesses. 
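+
+Example invocation (a minimal sketch; option names follow main() below, and
+the data files under --path are expected to be named LANG-task1-train,
+LANG-task1-dev, and so on):
+
+    python baseline.py --language=LANG --task=1 --align=mcmc --path=./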
+ +Author: Mans Hulden +Last Update: 11/29/2015 +""" + +from __future__ import print_function +import perceptron_c, align, codecs, sys, re, getopt + +class MorphModel: + def __init__(self): + self.features = {'tolemma':None, 'fromlemma':None} + self.classes = {'tolemma':None, 'fromlemma':None} + self.classifier = {'tolemma':None, 'fromlemma':None} + +class Morph: + + def __init__(self): + self.models = {} + self.msdfeatures = None + self.msdclasses = None + self.msdclassifier = None + + def generate(self, word, featurestring, mode): + """Generates an output string from an input word and target + feature string. The 'mode' variable is either 'tolemma' or + 'fromlemma' """ + pos = re.match(r'pos=([^,]*)', featurestring).group(1) + ins = ['<'] + list(word) + ['>'] + outs = [] + prevaction = 'None' + position = 0 + while position < len(ins): + feats = list(train_get_surrounding_syms(ins, position, u'in_')) + \ + list(train_get_surrounding_syms(outs, position, u'out_', lookright = False)) + \ + ['prevaction='+prevaction] + [u'MSD:' + featurestring] + feats = feature_pairs(feats) + decision = self.models[pos].classifier[mode].decision_function(feats) + decision = sorted(decision, key = lambda x: x[1], reverse = True) + prevaction = self._findmax(decision, prevaction, len(ins)-position-1) + actionlength, outstring = interpret_action(prevaction, ins[position]) + outs.append(outstring) + position += actionlength + return ''.join(outs[1:-1]) + + def _findmax(self, decision, lastaction, maxlength): + """Find best action that doesn't conflict with last (can't del/ins/chg two in a row) + and isn't too long (can't change/del more than what remains).""" + if lastaction[0] == 'D' or lastaction[0] == 'C' or lastaction[0] == 'I': + for x in xrange(len(decision)): + if decision[x][0][0] != lastaction[0]: + if decision[x][0][0] == u'C' and len(decision[x][0][1:]) > maxlength: + continue + if decision[x][0][0] == u'D' and int(decision[x][0][1:]) > maxlength: + continue + return decision[x][0] + else: + return decision[0][0] + + def add_features(self, pos, features, classes, mode): + """Adds a collection of feautures and classes to a pos model + 'mode' is either 'tolemma' or 'fromlemma'.""" + if pos not in self.models: + self.models[pos] = MorphModel() + self.models[pos].features[mode] = features + self.models[pos].classes[mode] = classes + + def get_pos(self): + """Simply lists all poses associated with a model.""" + return list(self.models.keys()) + + def add_classifier(self, pos, classifier, mode): + """Adds a classifier to a pos model in a certain mode.""" + self.models[pos].classifier[mode] = classifier + + def get_features(self, pos, mode): + return self.models[pos].features[mode] + + def get_classes(self, pos, mode): + return self.models[pos].classes[mode] + + def extract_task3(self, lang, path): + + # We use the msd/form combinations from all three + msdform = set() + lines = [line.strip() for line in codecs.open(path + lang +'-task1-train', "r", encoding="utf-8")] + for l in lines: + lemma, msd, form = l.split(u'\t') + msdform.add((msd, form)) + lines = [line.strip() for line in codecs.open(path + lang +'-task2-train', "r", encoding="utf-8")] + for l in lines: + msd1, form1, msd2, form2 = l.split(u'\t') + msdform.add((msd1, form1)) + msdform.add((msd2, form2)) + lines = [line.strip() for line in codecs.open(path + lang +'-task3-train', "r", encoding="utf-8")] + for l in lines: + form1, msd2, form2 = l.split(u'\t') + msdform.add((msd2, form2)) + + self.msdfeatures = [] + self.msdclasses = [] + for msd, 
form in msdform: + formfeatures = extract_substrings(form) + self.msdfeatures.append(formfeatures) + self.msdclasses.append(msd) + + def extract_task1(self, filename, mode, path): + """Parse a file and extract features/classes for + mapping to and from a lemma form.""" + + lemmas = {} + poses = set() + lines = [line.strip() for line in codecs.open(path + filename, "r", encoding="utf-8")] + for l in lines: + if 'pos=' not in l: + continue + lemma, feats, form = l.split(u'\t') + pos = re.match(r'pos=([^,]*)', feats).group(1) + if lemma not in lemmas: + lemmas[lemma] = [] + lemmas[lemma].append((lemma, 'pos=' + pos + ',lemma=true')) + lemmas[lemma].append((form, feats)) + if pos not in poses: + poses.add(pos) + + pairs = [] + wordpairs = [] + for lemma in lemmas: + lemmafeatures = lemmas[lemma] + for x in lemmafeatures: + for y in lemmafeatures: + if (x != y) and ('lemma=true' in x[1]) and (mode == 'fromlemma'): + pairs.append(tuple((x[0], y[0], y[1]))) + # inword, outword, msdfeatures + wordpairs.append(tuple((x[0], y[0]))) + elif (x != y) and ('lemma=true' in x[1]) and (mode == 'tolemma'): + pairs.append(tuple((y[0], x[0], y[1]))) + # inword, outword, msdfeatures + wordpairs.append(tuple((y[0], x[0]))) + + if ALIGNTYPE == 'mcmc': + alignedpairs = mcmc_align(wordpairs, ALIGN_SYM) + elif ALIGNTYPE == 'med': + alignedpairs = med_align(wordpairs, ALIGN_SYM) + else: + alignedpairs = dumb_align(wordpairs, ALIGN_SYM) + + chunkedpairs = chunk(alignedpairs) + + for pos in poses: # Do one model per POS + features = [] + classes = [] + for idx, pair in enumerate(chunkedpairs): + if 'pos=' + pos not in pairs[idx][2]: + continue + instring = ['<'] + [x[0] for x in pair] + ['>'] + outstring = ['<'] + [x[1] for x in pair] + ['>'] + + #msdfeatures = pairs[idx][2].split(':') # separate features + msdfeatures = [ pairs[idx][2] ] # don't separate features + msdfeatures = ['MSD:' + f for f in msdfeatures] + prevaction = 'None' + for position in range(0, len(instring)): + thiscl, feats = train_get_features(instring, outstring, position) + classes.append(thiscl) + featurelist = list(feats) + msdfeatures + ['prevaction='+prevaction] + featurelist = feature_pairs(featurelist) + features.append(featurelist) + prevaction = thiscl + self.add_features(pos, features, classes, mode) + +def feature_pairs(f): + """Expand features to include pairs of features + where one is always a f=v feature.""" + pairs = [x + ".x." + y for x in f for y in f if u'=' in y] + return pairs + f + +def dumb_align(wordpairs, align_symbol): + alignedpairs = [] + for idx, pair in enumerate(wordpairs): + ins = pair[0] + outs = pair[1] + if len(ins) > len(outs): + outs = outs + align_symbol * (len(ins)-len(outs)) + elif len(outs) > len(ins): + ins = ins + align_symbol * (len(outs)-len(ins)) + alignedpairs.append((ins, outs)) + return alignedpairs + +def mcmc_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol = align_symbol, random_seed = 42) + return a.alignedpairs + +def med_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol = align_symbol, mode = 'med') + return a.alignedpairs + +def train_get_surrounding_syms(s, position, featureprefix, lookright = True): + """Get surrounding symbols from a list of chunks and position. 
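+
+    Left-context features are 'p' + featureprefix + (the last 1-3 characters
+    before position) and right-context features are 'n' + featureprefix + (the
+    first 1-3 characters from position on); the u'_' empty symbol is stripped
+    and 'none' marks a word boundary.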
+ >>> s = ['<', u'a', u'b', u'u', u'_', u't', u'a', u'n', u'doka', '>'] + >>> train_get_surrounding_syms(s, 4, 'in_') + set([u'nin_ta', u'nin_t', u'nin_tan', u'pin_u', u'pin_bu', u'pin_abu']) + """ + leftfeats = set() + rightfeats = set() + if position == 0: + leftfeats |= {u'p' + featureprefix + u'none'} + if (position == len(s)) and lookright: + rightfeats |= {u'n' + featureprefix + u'none'} + if position > 0: + left = ''.join(s[:position]).replace(u'_', u'') + leftfeats |= {u'p' + featureprefix + left[x:] for x in [-1,-2,-3]} + if (position < len(s)) and lookright: + right = ''.join(s[position:]).replace(u'_', u'') + rightfeats |= {u'n' + featureprefix + right[:x] for x in [1,2,3]} + return leftfeats | rightfeats + +def train_get_features(ins, outs, position): + feats = set() + # Get class first # + if ins[position] == outs[position]: + cl = "R" + elif u'_' in ins[position]: + cl = "I" + outs[position] + elif u'_' in outs[position]: + cl = "D" + unicode(len(ins[position])) + else: + cl = "C" + outs[position] + + # Get features of surrounding symbols # + feats |= train_get_surrounding_syms(ins, position, u'in_') + feats |= train_get_surrounding_syms(outs, position, u'out_', lookright = False) + return cl, feats + +def interpret_action(action, ins): + """Interpret classifier class: return length of input to consume + output.""" + if action[0] == u'R': + return (1, ins) + elif action[0] == u'D': + return int(action[1:]), u'' + elif action[0] == u'C': + return len(action[1:]), action[1:] + elif action[0] == u'I': + return 0, action[1:] + +def chopup(s, t): + """Returns grouped alignment of two strings + in such a way that consecutive del/ins/chg operations + are grouped to be one single operation. + The input is two 1-to-1 aligned strings where _ = empty string. 
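+    action() labels each position ins/del/chg/rep; a chunk boundary is placed
+    between two 'rep' positions and wherever the action type changes, so runs
+    of identical ins/del/chg positions are merged into a single operation.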
+ >>> chopup(['ka__yyab','kaxx__xy']) + (['k', 'a', u'_', 'yy', 'ab'], ['k', 'a', 'xx', u'_', 'xy']) + """ + def action(inchar, outchar): + if inchar == u'_': + return 'ins' + elif outchar == u'_': + return 'del' + elif inchar != outchar: + return 'chg' + else: + return 'rep' + + idx = 1 + s = list(s) + t = list(t) + while idx < len(s): + l = action(s[idx-1], t[idx-1]) + r = action(s[idx], t[idx]) + if (l == 'rep' and r == 'rep') or (l != r): + s.insert(idx, ' ') + t.insert(idx, ' ') + idx += 1 + idx += 1 + s = tuple(u'_' if u'_' in x else x for x in ''.join(s).split(' ')) + t = tuple(u'_' if u'_' in x else x for x in ''.join(t).split(' ')) + return zip(s,t) + +def chunk(pairs): + """Chunk alignments to have possibly more than one symbol-one symbol.""" + chunkedpairs = [] + for instr, outstr in pairs: + chunkedpairs.append(chopup(instr, outstr)) + return chunkedpairs + +def extract_substrings(word): + """Get len 2/3 substrings and return as list.""" + w3 = zip(word, word[1:], word[2:]) + w2 = zip(word, word[1:]) + return [''.join(x) for x in w2+w3] + +def announce(*objs): + print("***", *objs, file = sys.stderr) + +def main(argv): + global ALIGN_SYM + global ALIGNTYPE + global TASK + + options, remainder = getopt.gnu_getopt(argv[1:], 'l:t:a:p:', ['language=','task=','align=','path=']) + + PATH, ALIGN_SYM, ALIGNTYPE, TASK = './', u'_', 'mcmc', 1 + for opt, arg in options: + if opt in ('-l', '--language'): + LANGUAGE = arg + elif opt in ('-t', '--task'): + TASK = int(arg) + elif opt in ('-a', '--align'): + ALIGNTYPE = arg + elif opt in ('-p', '--path'): + PATH = arg + + train = Morph() + announce(LANGUAGE + ": learning alignment for form > lemma mapping") + train.extract_task1(LANGUAGE + '-task1-train', 'fromlemma', PATH) + if TASK == 2 or TASK == 3: + announce(LANGUAGE + ": learning alignment for lemma > form mapping") + train.extract_task1(LANGUAGE + '-task1-train', 'tolemma', PATH) + + if TASK == 1 or TASK == 2 or TASK == 3: + for pos in train.get_pos(): + announce(LANGUAGE + ": training " + pos + " for lemma > form mapping") + P = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + P.fit(train.get_features(pos, 'fromlemma'), train.get_classes(pos, 'fromlemma')) + train.add_classifier(pos, P, 'fromlemma') + + if TASK == 2 or TASK == 3: + for pos in train.get_pos(): + announce(LANGUAGE + ": training " + pos + " for form > lemma mapping") + P = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + P.fit(train.get_features(pos, 'tolemma'), train.get_classes(pos, 'tolemma')) + train.add_classifier(pos, P, 'tolemma') + + if TASK == 3: + train.extract_task3(LANGUAGE, PATH) + announce(LANGUAGE + ": training form > msd classifier") + train.msdclassifier = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + train.msdclassifier.fit(train.msdfeatures, train.msdclasses) + + testlines = [line.strip() for line in codecs.open(PATH+LANGUAGE + '-task' + str(TASK) + '-dev', "r", encoding="utf-8")] + if TASK == 1: + for l in testlines: + lemma, targetmsd, wordform = l.split('\t') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((lemma + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + + if TASK == 2: + for l in testlines: + sourcemsd, sourceform, targetmsd, targetform = l.split('\t') + lemma = train.generate(sourceform, sourcemsd, 'tolemma') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((sourcemsd + "\t" + 
sourceform + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + + if TASK == 3: + for l in testlines: + sourceform, targetmsd, targetform = l.split('\t') + sourcemsd = train.msdclassifier.predict(extract_substrings(sourceform)) + lemma = train.generate(sourceform, sourcemsd, 'tolemma') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((sourceform + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + +if __name__ == "__main__": + main(sys.argv) diff --git a/baseline/perceptron.c b/baseline/perceptron.c new file mode 100644 index 0000000..f3a6420 --- /dev/null +++ b/baseline/perceptron.c @@ -0,0 +1,397 @@ +/************************************************************************/ +/* Simple perceptron/averaged perceptron library */ +/* Author: Mans Hulden (mans.hulden@gmail.com) */ +/* Copyright 2014 Mans Hulden */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* - MH20140831 */ +/************************************************************************/ + +/* To build for python bindings: gcc -O3 -Wall -Wextra -shared perceptron.c -o libperceptron.so */ + +/* Usage: + +(1) Call perceptron_init with desired parameters +(2) Add examples to training set using examples_add() + - optionally also to dev set using devexamples_add() +(3) Train perceptron using perceptron_train() +(4) Classify with perceptron_decision_function_int()/perceptron_decision_function_double() + (int is for non-averaged/double for averaged) + which return a vector of weights for all classes + or: + Just use perceptron_classify_double()/perceptron_classify_int() + which return the best class index +(5) perceptron_destroy() frees all data structures associated + +Notes: + +- Only supports binary features in examples +- Uses a sparse representation where only hot features are given for examples +- Weights are integers, although double weights are used for the averaged case + +*/ + +/******************************************************************************/ + +#include +#include +#include +#include /* For INT_MAX */ +#include /* For DBL_MAX */ + +struct perceptron { + struct examples *ex; /* Training examples */ + struct examples *devex; /* Dev examples */ + int averaged; /* Use averaged perceptron or vanilla? */ + int tune_on_averaged; /* Whether to tune AP on averaged weights or running weights */ + int num_examples; /* Training set size */ + int num_devexamples; /* Dev set size */ + int examplecounter; /* Running counter when adding examples one-by-one */ + int devexamplecounter; /* Running counter when adding examples one-by-one */ + int num_classes; /* Number of distinct classes */ + int num_features; /* Numer of features */ + int max_iter; /* Max iterations to run */ + int shuffle; /* Shuffle examples before each iteration? */ + int verbose; /* Print stats to stderr? 
*/ + + int *intweights; /* Running int weights */ + int *intbiases; /* Running int biases */ + double *doubleweights; /* Weights for averaged perceptron */ + double *doublebiases; /* Biases */ + double *lastdweights; /* Store temp weights for tuning w/ dev set */ + double *lastdbiases; /* Store temp biases for tuning w/ dev set */ + int *lastiweights; /* Store temp weights for tuning w/ dev set */ + int *lastibiases; /* Store temp biases for tuning w/ dev set */ +}; + +struct examples { + int *hotfeatures; /* A list of the features that are hot in this example */ + int len; /* Number of hot features in example */ + int correctclass; /* The class the example belongs to */ +}; + +/******************************************************************************/ + +/* Initialize the perceptron structure, returns handle */ +struct perceptron *perceptron_init(int max_iter, int num_examples, int num_devexamples, int num_features, int num_classes, int averaged, int shuffle, int random_seed, int tune_on_averaged, int verbose); + +/* Train perceptron w/ current training/(dev) examples and settings */ +void perceptron_train(struct perceptron *perceptron); + +/* Free examples/weights + perceptron data structure */ +void perceptron_destroy(struct perceptron *p); + +/* Decision function for example (features holds list of hot features, len is number of hot features) */ +/* Returns vector of weights for each class (highest weight is best class) */ +double *perceptron_decision_function_double(struct perceptron *perceptron, int *features, int len); + +/* Decision function for non-averaged perceptron */ +/* Returns vector of weights for each class (highest weight is best class) */ +int *perceptron_decision_function_int(struct perceptron *perceptron, int *features, int len); + +/* Classify function for example (features holds list of hot features, len is number of hot features) */ +/* Returns best class number */ +/* dev_tuning and numiter are internal parameters used while training/set these to 0,0 */ +int perceptron_classify_double(struct perceptron *perceptron, int *features, int len, int dev_tuning, int numiter); + +/* Classify function for example (features holds list of hot features, len is number of hot features) */ +/* Returns best class number */ +/* Use for non-averaged perceptron */ +int perceptron_classify_int(struct perceptron *perceptron, int *features, int len); + +/* Add an example to the training set */ +/* supply perceptron handle, a vector of hot features, len of this vector, and correct class index */ +void examples_add(struct perceptron *perceptron, int *features, int len, int correctclass); + +/* Add an example to the dev set */ +/* supply perceptron handle, a vector of hot features, len of this vector, and correct class index */ +void devexamples_add(struct perceptron *perceptron, int *features, int len, int correctclass); + +/******************************************************************************/ + +struct perceptron *perceptron_init(int max_iter, int num_examples, int num_devexamples, int num_features, int num_classes, int averaged, int shuffle, int random_seed, int tune_on_averaged, int verbose) { + struct perceptron *p; + p = calloc(1, sizeof(struct perceptron)); + p->max_iter = max_iter; + p->num_examples = num_examples; + p->examplecounter = 0; + p->devexamplecounter = 0; + p->num_classes = num_classes; + p->num_features = num_features; + p->shuffle = shuffle; + p->ex = calloc(num_examples, sizeof(struct examples)); + p->verbose = verbose; + p->tune_on_averaged = 
tune_on_averaged; + if (random_seed) + srand(random_seed); + if (num_devexamples > 0) { + p->num_devexamples = num_devexamples; + p->devex = calloc(num_devexamples, sizeof(struct examples)); + } + p->intweights = calloc(num_features * num_classes, sizeof(int)); + p->intbiases = calloc(num_classes, sizeof(int)); + p->averaged = averaged; + if (p->averaged) { + p->doubleweights = calloc(num_features * num_classes, sizeof(double)); + p->doublebiases = calloc(num_classes, sizeof(double)); + p->lastdweights = calloc(num_features * num_classes, sizeof(double)); + p->lastdbiases = calloc(num_classes, sizeof(double)); + p->lastiweights = calloc(num_features * num_classes, sizeof(int)); + p->lastibiases = calloc(num_classes, sizeof(int)); + } + return p; +} + +static int rand_int(int n) { + int limit = RAND_MAX - RAND_MAX % n; + int rnd; + do { + rnd = rand(); + } while (rnd >= limit); + return rnd % n; +} + +void shuffle(int *array, int n) { + int i, j, tmp; + for (i = n - 1; i > 0; i--) { + j = rand_int(i + 1); + tmp = array[j]; + array[j] = array[i]; + array[i] = tmp; + } +} + +void perceptron_train(struct perceptron *perceptron) { + int i, j, n, m, guessedclass, correctclass, *weightptr, numincorrect, itercount, *classorder, devcorrect, devlastcorrect; + double *dweightptr; + itercount = 1; + classorder = calloc(perceptron->num_examples, sizeof(int)); + for (i = 0; i < perceptron->num_examples; i++) { + classorder[i] = i; + } + devlastcorrect = 0; + for (i = 0; i < perceptron->max_iter; i++) { + if (perceptron->shuffle) + shuffle(classorder, perceptron->num_examples); + numincorrect = 0; + for (n = 0; n < perceptron->num_examples; n++) { + m = classorder[n]; + guessedclass = perceptron_classify_int(perceptron, perceptron->ex[m].hotfeatures, perceptron->ex[m].len); + correctclass = perceptron->ex[m].correctclass; + if (guessedclass != correctclass) { + numincorrect++; + for (j = 0; j < perceptron->ex[m].len; j++) { + weightptr = perceptron->intweights + perceptron->num_features * correctclass; /* Points to correct class weights */ + weightptr += *(perceptron->ex[m].hotfeatures + j); + *(weightptr) += 1; + + weightptr = perceptron->intweights + perceptron->num_features * guessedclass; /* Points to incorrect class weights */ + weightptr += *(perceptron->ex[m].hotfeatures + j); + *(weightptr) -= 1; + } + perceptron->intbiases[correctclass] += 1; + perceptron->intbiases[guessedclass] -= 1; + if (perceptron->averaged) { + for (j = 0; j < perceptron->ex[m].len; j++) { + dweightptr = perceptron->doubleweights + perceptron->num_features * correctclass; /* Points to correct class weights */ + dweightptr += *(perceptron->ex[m].hotfeatures + j); + *(dweightptr) += 1.0 * (double)itercount; + + dweightptr = perceptron->doubleweights + perceptron->num_features * guessedclass; /* Points to incorrect class weights */ + dweightptr += *(perceptron->ex[m].hotfeatures + j); + *(dweightptr) -= 1.0 * (double)itercount; + } + perceptron->doublebiases[correctclass] += 1.0 * (double)itercount; + perceptron->doublebiases[guessedclass] -= 1.0 * (double)itercount; + } + } + itercount++; + } + + /* Print stats */ + if (perceptron->verbose) { + fprintf(stderr, "Iteration %i - TRAIN: (%i/%i) %lg", i+1, perceptron->num_examples-numincorrect, perceptron->num_examples, (double)(perceptron->num_examples-numincorrect)/(double)perceptron->num_examples); + } + /* Now test on dev set (if available) */ + if (perceptron->num_devexamples > 0) { + for (n = 0, devcorrect = 0; n < perceptron->num_devexamples; n++) { + if 
(perceptron->averaged && perceptron->tune_on_averaged) { + guessedclass = perceptron_classify_double(perceptron, perceptron->devex[n].hotfeatures, perceptron->devex[n].len, 1, itercount); + } else { + guessedclass = perceptron_classify_int(perceptron, perceptron->devex[n].hotfeatures, perceptron->devex[n].len); + } + correctclass = perceptron->devex[n].correctclass; + if (guessedclass == correctclass) { + devcorrect++; + } + } + if (perceptron->verbose) + fprintf(stderr, " - DEV (%i/%i) %lg", devcorrect, perceptron->num_devexamples, (double)devcorrect/(double)perceptron->num_devexamples); + if (devcorrect < devlastcorrect) { + if (perceptron->verbose) + fprintf(stderr, "\n"); + break; /* Stop iterations - performance went down */ + } + devlastcorrect = devcorrect; + } + if (perceptron->verbose) + fprintf(stderr, "\n"); + if (numincorrect == 0) { + break; + } + /* Store current (averaged) weights so we can restore them if performance goes down */ + if (perceptron->averaged) { + memcpy(perceptron->lastdweights, perceptron->doubleweights, perceptron->num_features * perceptron->num_classes * sizeof(double)); + memcpy(perceptron->lastdbiases, perceptron->doublebiases, perceptron->num_classes * sizeof(double)); + memcpy(perceptron->lastiweights, perceptron->intweights, perceptron->num_features * perceptron->num_classes * sizeof(int)); + memcpy(perceptron->lastibiases, perceptron->intbiases, perceptron->num_classes * sizeof(int)); + } + } + if (perceptron->averaged) { + /* If we use AP w/ dev set, take previous weights because performance has dropped on dev set */ + for (i = 0; i < perceptron->num_features * perceptron->num_classes; i++) { + if (perceptron->num_devexamples > 0 && perceptron->tune_on_averaged) + perceptron->doubleweights[i] = (double)perceptron->lastiweights[i] - perceptron->lastdweights[i]/((double)itercount - 1); + else + perceptron->doubleweights[i] = (double)perceptron->intweights[i] - perceptron->doubleweights[i]/(double)itercount; + } + for (i = 0; i < perceptron->num_classes; i++) { + if (perceptron->num_devexamples > 0 && perceptron->tune_on_averaged) + perceptron->doublebiases[i] = (double)perceptron->lastibiases[i] - perceptron->lastdbiases[i]/((double)itercount - 1); + else + perceptron->doublebiases[i] = (double)perceptron->intbiases[i] - perceptron->doublebiases[i]/(double)itercount; + } + } +} + +void perceptron_free_wrapper(void *ptr) { + if (ptr != NULL) + free(ptr); +} + +void perceptron_destroy(struct perceptron *p) { + free(p->ex); + if (p->num_devexamples > 0) { + free(p->devex); + } + free(p->intweights); + free(p->intbiases); + if (p->averaged) { + free(p->doubleweights); + free(p->doublebiases); + free(p->lastdweights); + free(p->lastdbiases); + free(p->lastiweights); + free(p->lastibiases); + } + free(p); +} + +double *perceptron_decision_function_double(struct perceptron *perceptron, int *features, int len) { + int f, c, fnum; + double cumweight, *fweight, *prediction; + prediction = calloc(perceptron->num_classes, sizeof(double)); + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0.0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->doubleweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->doublebiases[c]; + prediction[c] = cumweight; + } + return prediction; +} + +int *perceptron_decision_function_int(struct perceptron *perceptron, int *features, int len) { + int f, c, fnum; + int cumweight, *fweight, *prediction; + prediction = 
calloc(perceptron->num_classes, sizeof(int)); + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->intweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->intbiases[c]; + prediction[c] = cumweight; + } + return prediction; +} + +int perceptron_classify_double(struct perceptron *perceptron, int *features, int len, int dev_tuning, int numiter) { + int f, c, fnum, maxclass, ptr; + double maxweight, cumweight, fweight; + maxclass = 0; + maxweight = -DBL_MAX; + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0.0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + if (dev_tuning) { + ptr = perceptron->num_features * c + fnum; + fweight = (double)perceptron->intweights[ptr] - perceptron->doubleweights[ptr]/(double)numiter; + } else { + fweight = perceptron->doubleweights[perceptron->num_features * c + fnum]; + } + cumweight += fweight; + } + if (dev_tuning) { + cumweight += (double)perceptron->intbiases[c] - perceptron->doublebiases[c]/(double)numiter; + } else { + cumweight += perceptron->doublebiases[c]; + } + if (cumweight > maxweight) { + maxweight = cumweight; + maxclass = c; + } + } + return maxclass; +} + +int perceptron_classify_int(struct perceptron *perceptron, int *features, int len) { + int f, c, *fweight, fnum, maxclass, maxweight, cumweight; + maxclass = 0; + maxweight = -INT_MAX; + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->intweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->intbiases[c]; + if (cumweight > maxweight) { + maxweight = cumweight; + maxclass = c; + } + } + return maxclass; +} + +void examples_add(struct perceptron *perceptron, int *features, int len, int correctclass) { + struct examples *ex; + ex = perceptron->ex; + ex[perceptron->examplecounter].len = len; + ex[perceptron->examplecounter].correctclass = correctclass; + ex[perceptron->examplecounter].hotfeatures = malloc(len * sizeof(int)); + memcpy(ex[perceptron->examplecounter].hotfeatures, features, len * sizeof(int)); + perceptron->examplecounter++; +} + +void devexamples_add(struct perceptron *perceptron, int *features, int len, int correctclass) { + struct examples *ex; + ex = perceptron->devex; + ex[perceptron->devexamplecounter].len = len; + ex[perceptron->devexamplecounter].correctclass = correctclass; + ex[perceptron->devexamplecounter].hotfeatures = malloc(len * sizeof(int)); + memcpy(ex[perceptron->devexamplecounter].hotfeatures, features, len * sizeof(int)); + perceptron->devexamplecounter++; +} diff --git a/baseline/perceptron_c.py b/baseline/perceptron_c.py new file mode 100644 index 0000000..bbbd094 --- /dev/null +++ b/baseline/perceptron_c.py @@ -0,0 +1,114 @@ +# Wrapper around simple perceptron/averaged perceptron C-library. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 + +# Relies on C-code in libpercetron.so built from percetron.c through ctypes. 
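+#
+# Illustrative usage (a minimal sketch; assumes libperceptron.so has been built
+# with `make` and sits in the current working directory, since the module loads
+# './libperceptron.so'; the toy features and labels below are made up):
+#
+#   from perceptron_c import Perceptron
+#   X = [['f=low', 'len=2'], ['f=high', 'len=2'], ['f=low', 'len=3']]
+#   y = ['A', 'B', 'A']
+#   P = Perceptron(max_iter=10, averaged=True, shuffle=True, random_seed=42,
+#                  verbose=True)
+#   P.fit(X, y)              # dev data may also be passed: P.fit(X, y, dX, dy)
+#   # decision_function returns (class, score) pairs; the best class scores highest.
+#   scores = P.decision_function(['f=low', 'len=3'])
+#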
+# Author: Mans Hulden + +from ctypes import * + +perceptron = cdll.LoadLibrary('./libperceptron.so') + +perceptron_perceptron_init = perceptron.perceptron_init +perceptron_perceptron_init.restype = c_void_p +perceptron_examples_add = perceptron.examples_add +perceptron_examples_add.restype = None +perceptron_devexamples_add = perceptron.devexamples_add +perceptron_devexamples_add.restype = None +perceptron_perceptron_train = perceptron.perceptron_train +perceptron_perceptron_train.restype = None +perceptron_perceptron_classify_int = perceptron.perceptron_classify_int +perceptron_perceptron_classify_int.restype = c_int +perceptron_perceptron_classify_double = perceptron.perceptron_classify_double +perceptron_perceptron_classify_double.restype = c_int +perceptron_perceptron_decision_function_double = perceptron.perceptron_decision_function_double +perceptron_perceptron_decision_function_double.restype = POINTER(c_double) +perceptron_perceptron_decision_function_int = perceptron.perceptron_decision_function_int +perceptron_perceptron_decision_function_int.restype = POINTER(c_int) +perceptron_perceptron_free_wrapper = perceptron.perceptron_free_wrapper +perceptron_perceptron_destroy = perceptron.perceptron_destroy + +class Perceptron: + + def __init__(self, max_iter = 20, averaged = False, shuffle = True, random_seed = False, tune_on_averaged = False, verbose = False): + self.max_iter = max_iter + self.averaged = averaged + self.shuffle = shuffle + self.random_seed = random_seed + self.verbose = verbose + self.tune_on_averaged = tune_on_averaged + self.perceptronhandle = None + + def __del__(self): + if self.perceptronhandle: + perceptron_perceptron_destroy(c_void_p(self.perceptronhandle)) + + def fit(self, features, classes, devfeatures = [], devclasses = []): + # Map features to integers starting from 0 + self.num_examples = len(features) + self.num_devexamples = len(devfeatures) + fset = sorted(list(set([f for g in features + devfeatures for f in g]))) + self.inttofeat = dict(zip(range(len(fset)), fset)) + self.feattoint = dict(zip(fset, range(len(fset)))) + self.features = [[self.feattoint[f] for f in g] for g in features] + self.num_features = len(fset) + self.devfeatures = [[self.feattoint[f] for f in g] for g in devfeatures] + + # Map classes to integers starting from 0 + cset = sorted(list(set([c for c in classes + devclasses]))) + self.inttoclass = dict(zip(range(len(cset)), cset)) + self.classtoint = dict(zip(cset, range(len(cset)))) + self.classes = [self.classtoint[f] for f in classes] + self.devclasses = [self.classtoint[f] for f in devclasses] + self.num_classes = len(cset) + + self.perceptronhandle = perceptron_perceptron_init(c_int(self.max_iter), c_int(self.num_examples), c_int(self.num_devexamples), c_int(self.num_features), c_int(self.num_classes), c_int(self.averaged), c_int(self.shuffle), c_int(self.random_seed), c_int(self.tune_on_averaged), c_int(self.verbose)) + + for index, example_fs in enumerate(self.features): + f = (c_int * len(example_fs))(*example_fs) + perceptron_examples_add(c_void_p(self.perceptronhandle), f, c_int(len(example_fs)), c_int(self.classes[index])) + + for index, example_fs in enumerate(self.devfeatures): + f = (c_int * len(example_fs))(*example_fs) + perceptron_devexamples_add(c_void_p(self.perceptronhandle), f, c_int(len(example_fs)), c_int(self.devclasses[index])) + + perceptron_perceptron_train(c_void_p(self.perceptronhandle)) + + def decision_function(self, features): + test_fs = [self.feattoint[f] for f in features if f in self.feattoint] + 
f = (c_int * len(test_fs))(*test_fs) + if self.averaged: + classweights = perceptron_perceptron_decision_function_double(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + else: + classweights = perceptron_perceptron_decision_function_int(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + c = [(self.inttoclass[i], classweights[i]) for i in range(self.num_classes)] + perceptron_perceptron_free_wrapper(classweights) + return c + + def predict(self, features): + test_fs = [self.feattoint[f] for f in features if f in self.feattoint] + f = (c_int * len(test_fs))(*test_fs) + if self.averaged: + correctclass = perceptron_perceptron_classify_double(c_void_p(self.perceptronhandle), f, c_int(len(test_fs)), c_int(0), c_int(0)) + else: + correctclass = perceptron_perceptron_classify_int(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + return self.inttoclass[correctclass] + +if __name__ == "__main__": + P = Perceptron(shuffle = True, averaged = True, verbose = True) + # 4 training examples, no dev examples (can use any data type for features) + # We simply list the 'hot' features for each example + features = [['w','x','y','z'], ['u','w','x'],[232,'w'],[232,'x','y','z']] + # The corresponding classes + classes = ['CLASS_A','CLASS_A','CLASS_B','CLASS_A'] + # Train + P.fit(features, classes) + # Show probabilities of the classes for an instance + print(P.decision_function([232, 'w', 'z'])) # Print weights for classes + # Show how the classes correspond to indices + print(P.classtoint) + # Show the best class for example + print(P.predict([232, 'w', 'z'])) diff --git a/load_and_test_model_interactive.sh b/load_and_test_model_interactive.sh new file mode 100755 index 0000000..99f5868 --- /dev/null +++ b/load_and_test_model_interactive.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 3 ]; then # number of args + echo "USAGE: ${ME} " + echo + exit +fi +ftrain=$1 +model=$2 +seed=$3 +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --mono \ + --beam_width 1 \ + --restore ${model} \ + --interactive \ + --seed ${seed} + +if [[ $? == 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/load_and_test_model_on_files.sh b/load_and_test_model_on_files.sh new file mode 100755 index 0000000..713e5ac --- /dev/null +++ b/load_and_test_model_on_files.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 5 ]; then # number of args + echo "USAGE: " + echo "$ME" + exit +fi +ftrain=$1 +ftest=$2 +model=$3 +seed=$4 +out=$5 +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --ftest ${ftest} \ + --mono \ + --beam_width 1 \ + --restore ${model} \ + --seed ${seed} \ + --dump ${out} + + + + + +if [[ $? 
== 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/readers/aligned_reader.py b/readers/aligned_reader.py new file mode 100644 index 0000000..46ab9ab --- /dev/null +++ b/readers/aligned_reader.py @@ -0,0 +1,121 @@ +from __future__ import division +from __future__ import print_function + +import sys +import logging + +from seq2seq.lang import Lang +from seq2seq.constants import ALIGN_SYMBOL +from baseline import align_utils + +import random +from collections import Counter +# from seq2seq.main import oracle_action +from seq2seq.constants import STEP + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +import argparse + + +def safe_replace_spaces(s): + s = s.replace(" ", "#") + s = s.replace(" ", "") + s = s.replace("#", " ") + return s + + +def read_examples(fpath, native_or_eng="both", remove_spaces=False, weight=1.0): + examples = [] + bad = 0 + for idx, l in enumerate(open(fpath)): + parts = l.strip().split('\t') + if len(parts) == 3: + fr_sent, en_sent = parts[:2] + is_eng = True + elif len(parts) == 2: + # print(parts) + fr_sent, en_sent = parts + is_eng = False + elif len(parts) == 4: + fr_sent, en_sent, is_eng = parts[:3] + is_eng = True if is_eng=="True" else False + else: + logging.info("#%d bad line %d %s", bad, idx, parts) + bad += 1 + continue + if remove_spaces: + # fr_sent = fr_sent.replace(" ", "") + # en_sent = en_sent.replace(" ", "") + fr_sent = safe_replace_spaces(fr_sent) + en_sent = safe_replace_spaces(en_sent) + if native_or_eng == "nat" and not is_eng: + examples.append((fr_sent, en_sent, weight, is_eng)) + elif native_or_eng == "eng" and is_eng: + examples.append((fr_sent, en_sent, weight, is_eng)) + elif native_or_eng == "both": + examples.append((fr_sent, en_sent, weight, is_eng)) + else: + pass + if "!!!" 
in l and not is_eng: + logging.info("wierd line %s", l) + num_engs = sum([1 if ex[-1] == True else 0 for ex in examples]) + num_nats = sum([1 if ex[-1] == False else 0 for ex in examples]) + logging.info("read %d examples in \"%s\" mode", len(examples), native_or_eng) + logging.info("# engs %d", num_engs) + logging.info("# nats %d", num_nats) + return examples + + +def align_examples(examples, seed, algo="mcmc"): + logging.info("aligning using %d examples", len(examples)) + + pairs = [(x, y) for x, y, weight, is_eng in examples] + is_eng_list = [(weight,is_eng) for x, y, weight, is_eng in examples] + if algo == "dumb": + raise NotImplementedError + else: + aligned_pairs = align_utils.mcmc_align(pairs, ALIGN_SYMBOL, seed=seed) + ans = [(ax, ay, weight, is_eng) for (ax, ay), (weight, is_eng) in zip(aligned_pairs, is_eng_list)] + return ans + + +def load_aligned_data(examples, seed, mode=None): + ans = [] + + if mode == "mcmc": + aligned_data = align_examples(examples=examples, seed=seed) + else: + # No alignments --> seq2seq + aligned_data = examples + for x, y, weight, is_eng in aligned_data: + if mode == "mcmc": + raw_x, raw_y = x.replace(ALIGN_SYMBOL, ""), y.replace(ALIGN_SYMBOL, "") + raw_x, raw_y = ' '.join(list(raw_x)), ' '.join(list(raw_y)) + elif mode == "m2m": + raise NotImplementedError + else: + raw_x, raw_y = x, y + xs, ys = ' '.join(list(x)), ' '.join(list(y)) + ans.append((raw_x, raw_y, xs, ys, weight, is_eng)) + return ans + + +def oracle_action(example): + raw_x, raw_y, x, y, weight, is_eng = example + x = x.split(' ') + y = y.split(' ') + actions = [] + inputs = [] + alignments = list(zip(x, y)) + for idx, a in enumerate(alignments): + # if 1-to-0 alignment, then step + if a[1] == ALIGN_SYMBOL: + actions.append(STEP) + inputs.append(a[0]) + else: + actions.append(a[1]) + inputs.append(a[0]) + if idx + 1 < len(alignments) and alignments[idx + 1][0] != ALIGN_SYMBOL: + actions.append(STEP) + return inputs,actions + diff --git a/seq2seq/constants.py b/seq2seq/constants.py new file mode 100644 index 0000000..787077f --- /dev/null +++ b/seq2seq/constants.py @@ -0,0 +1,11 @@ +SOS_token = '' +EOS_token = '' +UNK = '' +SOS_ID = 0 +EOS_ID = 1 +UNK_ID = 2 +ALIGN_SYMBOL = '~' +STEP = '' +EPSILON = '' +ALIGN_SYMBOL = '~' + diff --git a/seq2seq/encoder.py b/seq2seq/encoder.py new file mode 100644 index 0000000..f73fb23 --- /dev/null +++ b/seq2seq/encoder.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V + + +class EncoderRNN(nn.Module): + def __init__(self, invoc_size: int, vector_size: int, hidden_size: int, n_layers: int = 1, batch_first: bool = True, + bidi=True, device_id=None) -> None: + super(EncoderRNN, self).__init__() + self.input_size = invoc_size + self.hidden_size = hidden_size + self.n_layers = n_layers + self.device_id = device_id + self.embedding = nn.Embedding(num_embeddings=invoc_size, + embedding_dim=vector_size) + self.bidi = bidi + self.batch_first = batch_first + self.gru = nn.GRU(input_size=vector_size, + hidden_size=hidden_size, + num_layers=n_layers, + batch_first=batch_first, + bidirectional=bidi + ) + self.no_pack_padded_seq = False + + def forward(self, word_inputs, hidden): + # Note: works with only batch_size = 1 + # Note: we run this all at once (over the whole input sequence) + max_len = len(word_inputs) + # L x D + embedded = self.embedding(word_inputs) + # 1 x L x D, batch first is True + embedded = embedded.view(1, max_len, -1) + # 1 x L x D, 1 x H x D + output, hidden = self.gru(embedded, hidden) + return 
output, hidden + + def init_hidden(self, batch_size=1): + if self.bidi: + k = self.n_layers * 2 + else: + k = self.n_layers * 1 + hidden = V(torch.zeros(k, batch_size, self.hidden_size)) + if self.device_id is not None: + hidden = hidden.cuda(self.device_id) + return hidden + + def _cuda(self, m): + if self.device_id is not None: + return m.cuda(self.device_id) + return m + + +if __name__ == '__main__': + encoder = EncoderRNN(invoc_size=10, vector_size=13, hidden_size=20) + print(encoder) + hidden = encoder.init_hidden() + input_words = torch.LongTensor([1, 2, 3, 4]) + output, hidden = encoder(V(input_words), hidden) + print('Output size:', output.size()) + print('Hidden size:', [h.size() for h in hidden]) diff --git a/seq2seq/evaluators/reporter.py b/seq2seq/evaluators/reporter.py new file mode 100644 index 0000000..c4b72bd --- /dev/null +++ b/seq2seq/evaluators/reporter.py @@ -0,0 +1,185 @@ +from __future__ import division +import shutil + +from collections import Counter, defaultdict +import logging + +import numpy as np + +from seq2seq.constants import EOS_token, SOS_token +from utils.news_evaluation_script import news_evaluation +from utils.news_evaluation_script.news_evaluation import compute_edit_dist as ED +from seq2seq.constants import STEP + +__author__ = 'Shyam' + + +def get_decoded_words(decoded_outputs): + ans = [] + # print(decoded_outputs) + for score, output in decoded_outputs: + if output[-1] == EOS_token: + output = output[:-1] + if output[0] == SOS_token: + output = output[1:] + output = [p for p in output if p != STEP] + output = " ".join(output) + ans.append((score, output)) + # print(ans) + return ans + + +def compute_acc_at_position(pred_dict, gold_dict, pos): + correct = 0 + for src_word in gold_dict: + gold = gold_dict[src_word] + preds = pred_dict[src_word] + # print(preds,gold) + if preds[pos] == gold[0]: + correct += 1 + # print(correct) + return correct + + +def print_evauation_details(pred_dict, gold_dict, header="all", vocab=None, beam_width=None): + acc_map, f, f_best_match, mrr, map_ref, acc_10, edit_dist, nrm_edit_dist = news_evaluation.evaluate(pred_dict=pred_dict, + gold_dict=gold_dict) + N = len(acc_map) + if N == 0: + logging.info("N is 0, returning ...") + return 0.0 + edit_dist_freqs = Counter(list(edit_dist.values())) + # for k in edit_dist: + # print(k,edit_dist[k]) + mean_ed_at_1 = np.mean(list(edit_dist.values())) + std_ed_at_1 = np.std(list(edit_dist.values())) + mean_ned_at_1 = np.mean(list(nrm_edit_dist.values())) + median_ed_at_1 = np.median(list(edit_dist.values())) + acc_num = float(sum([acc_map[src_word] for src_word in acc_map.keys()])) + acc10_num = float(sum([acc_10[src_word] for src_word in acc_10.keys()])) + accuracy = acc_num / N + accuracy10 = acc10_num / N + macro_f1 = float(sum([f[src_word] for src_word in f.keys()])) / N + logging.info(20 * "*" + header + 20 * "*") + logging.info('ACC: %f (%d/%d)', accuracy, acc_num, N) + logging.info('Mean F-score: %f', macro_f1) + logging.info('Mean ED@1: %f+-%.3f', mean_ed_at_1,std_ed_at_1) + logging.info('Mean NED@1: %f', mean_ned_at_1) + logging.info('Median ED@1: %f', median_ed_at_1) + for d in range(3): + logging.info('edit dist of %d: %f (%d/%d)', d, edit_dist_freqs[d] / N, edit_dist_freqs[d], N) + + if beam_width is not None: + for d in range(beam_width): + acc_at_d = compute_acc_at_position(pred_dict=pred_dict, gold_dict=gold_dict, pos=d) + logging.info("acc at %d: %.3f (%d/%d)", d, acc_at_d / N, acc_at_d, N) + # logging.info('MRR: %f', float(sum([mrr[src_word] for src_word in 
mrr.keys()])) / N) + # logging.info('MAP_ref: %f', float(sum([map_ref[src_word] for src_word in map_ref.keys()])) / N) + logging.info('ACC@10: %f (%d/%d)', accuracy10, acc10_num, N) + return accuracy, accuracy10 + + +class AccReporter: + def __init__(self, args, dump_file=None): + self.best_acc = 0 + self.args = args + self.best_acc10 = 0 + self.best_eng_acc = 0 + self.best_nat_acc = 0 + self.best_seen = 0 + self.best_epoch = 0 + self.dump_file = dump_file + + def print_details(self, epoch, gold_dict, pred_dict, header="all"): + beam_width = self.args["beam_width"] + # epoch, gold_dict, pred_dict, header = "all" + accuracy, accuracy10 = print_evauation_details(gold_dict=gold_dict, pred_dict=pred_dict, + header=header, beam_width=beam_width) + return accuracy, accuracy10 + + def report_eval(self, epoch, seen, examples, evaler): + pred_dict, gold_dict = {}, {} + eng_pred_dict, eng_gold_dict = {}, {} + nat_pred_dict, nat_gold_dict = {}, {} + correct = 0 + correct_nat = 0 + correct_eng = 0 + if self.dump_file is not None: + out = open(self.dump_file, "w") + else: + out = None + eng_nwords = sum([1 for (_, _, weight, is_eng) in examples if is_eng]) + nat_nwords = sum([1 for (_, _, weight, is_eng) in examples if not is_eng]) + for idx, example in enumerate(examples): + x, y, weight, is_eng = example + # print(weight,is_eng) + if idx > 0 and idx % 200 == 0: + logging.info("running infer on example %d", idx) + + decoded_outputs = evaler.infer_on_example(sentence=x) + scores_and_words = get_decoded_words(decoded_outputs) + decoded_words = [w for s, w in scores_and_words] + + key = x.replace(" ", "") + + pred_dict[key] = decoded_words + gold_dict[key] = [y] + + if is_eng: + eng_pred_dict[key] = decoded_words + eng_gold_dict[key] = [y] + else: + nat_pred_dict[key] = decoded_words + nat_gold_dict[key] = [y] + + if decoded_words[0] == y: + correct += 1 + if is_eng: + correct_eng += 1 + else: + correct_nat += 1 + + if out is not None: + edit_dists = ";".join([str(ED(ref=y, candidate=word)) for score, word in scores_and_words]) + beam_outputs = ";".join([word for score, word in scores_and_words]) + beam_scores = ";".join([str(score) for score, word in scores_and_words]) + buf = "%s\t%s\t%s\t%s\t%s\t%s\n" % (x, y, is_eng, beam_outputs, beam_scores, edit_dists) + out.write(buf) + + logging.info("accuracy %d/%d=%.2f", correct, len(examples), correct / len(examples)) + NAT_ACC = 0.0 if nat_nwords == 0 else correct_nat / nat_nwords + ENG_ACC = 0.0 if eng_nwords == 0 else correct_eng / eng_nwords + logging.info("accuracy (nat) %d/%d=%.2f", correct_nat, nat_nwords, NAT_ACC) + logging.info("accuracy (eng) %d/%d=%.2f", correct_eng, eng_nwords, ENG_ACC) + if out is not None: + out.close() + all_acc, all_acc10 = self.print_details(header="total", epoch=epoch, gold_dict=gold_dict, pred_dict=pred_dict) + if eng_nwords == 0: + eng_acc, eng_acc10 = 0, 0 + else: + eng_acc, eng_acc10 = self.print_details(header="eng", epoch=epoch, gold_dict=eng_gold_dict, + pred_dict=eng_pred_dict) + nat_acc, nat_acc10 = self.print_details(header="nat", epoch=epoch, gold_dict=nat_gold_dict, + pred_dict=nat_pred_dict) + ret_val = False + if eng_acc > self.best_eng_acc: + self.best_eng_acc = eng_acc + if nat_acc > self.best_nat_acc: + self.best_nat_acc = nat_acc + if all_acc10 > self.best_acc10: + self.best_acc10 = all_acc10 + if all_acc > self.best_acc: + self.best_acc = all_acc + self.best_seen = seen + self.best_epoch = epoch + ret_val = True + if ret_val is True and self.dump_file is not None: + bestpred = self.dump_file + '_best.txt' 
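+ # The dump file written above has one tab-separated line per test example:
+ # source, gold, is_eng, beam outputs, beam scores, and edit distances, where the
+ # last three fields join the beam candidates with ";". The copy made below
+ # (dump_file + '_best.txt') snapshots the predictions from the best epoch so far.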
+ logging.info("saving best predictions to file %s",bestpred) + shutil.copyfile(self.dump_file, bestpred) + logging.info("best accuracy: %.3f", self.best_acc) + logging.info("best accuracy@10: %.3f", self.best_acc10) + logging.info("best after %d mini-batches (%d epoch)", self.best_seen, self.best_epoch) + logging.info("best eng accuracy: %.3f", self.best_eng_acc) + logging.info("best nat accuracy: %.3f", self.best_nat_acc) + return ret_val, self.best_acc diff --git a/seq2seq/inferences/evaluate.py b/seq2seq/inferences/evaluate.py new file mode 100644 index 0000000..c0dc021 --- /dev/null +++ b/seq2seq/inferences/evaluate.py @@ -0,0 +1,27 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V +from torch import optim +import torch.nn.functional as F +from seq2seq.constants import EOS_token, SOS_ID, EOS_ID +from seq2seq.constants import SOS_token +from seq2seq.torch_utils import variable_from_sentence + + +class Inference: + def __init__(self, encoder, decoder, input_lang, output_lang, device_id=None): + self.encoder = encoder + self.decoder = decoder + self.input_lang, self.output_lang = input_lang, output_lang + self.device_id = device_id + + def infer_on_example(self, sentence): + self.encoder.eval() + self.decoder.eval() + ans = self.run_inference(sentence) + self.encoder.train() + self.decoder.train() + return ans + + def run_inference(self, sentence, max_length=100): + raise NotImplementedError diff --git a/seq2seq/inferences/monotonic_infer.py b/seq2seq/inferences/monotonic_infer.py new file mode 100644 index 0000000..6883b44 --- /dev/null +++ b/seq2seq/inferences/monotonic_infer.py @@ -0,0 +1,172 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch.autograd import Variable as V + +from seq2seq.constants import EOS_token +from seq2seq.constants import SOS_token, SOS_ID, UNK_ID +from seq2seq.constants import STEP +from seq2seq.inferences.evaluate import Inference + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +__author__ = 'Shyam' + + +def encode_string(input_str, word2index): + ans = [] + for w in input_str: + if w in word2index: + t = word2index[w] + else: + t = UNK_ID + ans.append(t) + return ans + + +class MonotonicInference(Inference): + def __init__(self, encoder, decoder, fr_lang, en_lang, device_id=None, beam_width=1, norm_by_length=False): + self.encoder = encoder + self.decoder = decoder + self.fr_lang, self.en_lang = fr_lang, en_lang + self.K = beam_width + self.norm_by_length = norm_by_length + self.device_id = device_id + + def run_inference(self, x, max_length=60): + # list of tokens + padded_lemma = [SOS_token] + x.split(' ') + [EOS_token] + padded_lemma_idx = encode_string(input_str=padded_lemma, word2index=self.fr_lang.word2index) + input_word = V(torch.LongTensor(padded_lemma_idx)) + enc_hid = self.encoder.init_hidden() + enc_outs, enc_hid = self.encoder(input_word, enc_hid) + + # initialize the decoder rnn + dec_hid = self.decoder.init_hidden() + + # set prev_output_vec for first lstm step as BEGIN_WORD + # prev_word = V(torch.LongTensor([SOS_ID])) + + # i is input index, j is output index + i = 0 + num_outputs = 0 + beam = [(0, i, [SOS_ID], dec_hid)] + outputs = [] + for beam_idx in range(3 * max_length): + next_beam = [] + for score, att_pos, ys, dec_hid in beam: + prev_word = V(torch.LongTensor([ys[-1]])) + decoder_output, next_dec_hid = self.decoder(prev_word, att_pos, dec_hid, enc_outs) + scores = self.decoder.out(decoder_output) + probs = F.softmax(scores, 
dim=-1) + # print("probs",probs) + topk_probs, topk_ints = torch.topk(probs, self.K, dim=2) + # print(topk_probs,topk_ints) + for k in range(self.K - len(outputs)): + top_score = np.log(topk_probs.data[0, 0, k]) + top_y = topk_ints.data[0, 0, k] + next_ys = ys + [top_y] + next_score = score + top_score + next_att_pos = att_pos + # print(top_y,self.en_lang.word2index[STEP]) + if top_y == self.en_lang.word2index[STEP]: + if att_pos < len(padded_lemma) - 1: + next_att_pos = att_pos + 1 + else: + next_att_pos = att_pos + if top_y == self.en_lang.word2index[EOS_token] or len(next_ys) == 3 * max_length: + # if not self.min_output_length or len(next_ys) >= self.min_output_length: + outputs.append((next_score, next_ys)) + else: + next_beam.append((next_score, next_att_pos, next_ys, next_dec_hid)) + if len(outputs) >= self.K: + break + # sort beam in descending order by score. + beam = list(sorted(next_beam, key=lambda tup: -tup[0]))[:self.K - len(outputs)] + + predicted_output_sequences = [] + for score, output in outputs: + seq = [] + for i in output: + seq.append(self.en_lang.index2word[i]) + if self.norm_by_length: + score /= len(seq) + predicted_output_sequences.append((score, seq)) + + predicted_output_sequences = sorted(predicted_output_sequences, key=lambda tup: -tup[0]) + prediction = predicted_output_sequences + return prediction + + def get_llh(self, x, y, max_length=60): + padded_lemma = [SOS_token] + x.split(' ') + [EOS_token] + padded_lemma_idx = encode_string(input_str=padded_lemma, word2index=self.fr_lang.word2index) + input_word = V(torch.LongTensor(padded_lemma_idx)) + enc_hid = self.encoder.init_hidden() + enc_outs, enc_hid = self.encoder(input_word, enc_hid) + y = y.split(' ') + [EOS_token] + # initialize the decoder rnn + dec_hid = self.decoder.init_hidden() + + # i is input index, j is output index + i = 0 + num_outputs = 0 + outputs = [] + y_idx = 0 + beam = [(0, i, y_idx, [SOS_ID], dec_hid)] + for idx in range(3 * max_length): + next_beam = [] + for score, att_pos, y_pos, ys, dec_hid in beam: + prev_word = V(torch.LongTensor([ys[-1]])) + decoder_output, next_dec_hid = self.decoder(prev_word, + att_pos, + dec_hid, + enc_outs) + scores = self.decoder.out(decoder_output) + probs = F.softmax(scores, dim=-1) + if y_pos == len(y): + seq = [] + for i in ys: + seq.append(self.en_lang.index2word[i]) + # print("finished seq:", [s for s in seq if s != STEP and s != SOS_token]) + outputs.append((score, ys)) + continue + yo = y[y_pos] + # print("yo:", yo) + # print(scores.size()) + yo_score = np.log(probs.data[0][0][self.en_lang.word2index[yo]]) + st_score = np.log(probs.data[0][0][self.en_lang.word2index[STEP]]) + if ys[-1] == self.en_lang.word2index[STEP] and ys[-2] == self.en_lang.word2index[STEP]: + possible_actions = [yo] + else: + possible_actions = [yo, STEP] + for action in possible_actions: + next_ys = ys + [self.en_lang.word2index[action]] + next_att_pos = att_pos + next_y_pos = y_pos + if action == STEP: + next_score = score + st_score + if att_pos < len(padded_lemma) - 1: + next_att_pos = att_pos + 1 + else: + next_score = score + yo_score + next_att_pos = att_pos + next_y_pos = y_pos + 1 + # print(next_ys) + # print(next_score) + new_state = (next_score, next_att_pos, next_y_pos, next_ys, next_dec_hid) + # print(new_state[:-1]) + next_beam.append(new_state) + # sort beam in descending order by score. 
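+ # Standalone sketch with toy values (not part of the decoding state): beam entries
+ # are (score, att_pos, y_pos, ys, dec_hid) tuples whose first element is a cumulative
+ # log-probability, so sorting by -score puts the most probable hypotheses first and
+ # the slice keeps at most K of them.
+ toy_beam = [(-1.2, 'hyp_a'), (-0.3, 'hyp_b'), (-2.5, 'hyp_c')]
+ toy_k = 2
+ toy_pruned = sorted(toy_beam, key=lambda tup: -tup[0])[:toy_k]
+ # toy_pruned == [(-0.3, 'hyp_b'), (-1.2, 'hyp_a')]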
+ # print("next_beam", len(next_beam)) + beam = list(sorted(next_beam, key=lambda tup: -tup[0]))[:self.K] + + outputs = sorted(outputs, key=lambda tup: -tup[0]) + predicted_output_sequences = [] + for score, output in outputs: + seq = [] + for i in output: + seq.append(self.en_lang.index2word[i]) + print("seq:", seq) + # print("seq:", [s for s in seq if s != STEP and s != SOS_token]) + print("sco:", score) + predicted_output_sequences.append((score, seq)) + print(x, y) diff --git a/seq2seq/lang.py b/seq2seq/lang.py new file mode 100644 index 0000000..9fb7347 --- /dev/null +++ b/seq2seq/lang.py @@ -0,0 +1,29 @@ +from seq2seq.constants import SOS_token, EOS_token, SOS_ID, EOS_ID, UNK, UNK_ID + + +class Lang: + def __init__(self, name): + self.name = name + self.vocab = set() + self.word2index = {SOS_token: SOS_ID, EOS_token: EOS_ID, UNK: UNK_ID} + self.word2count = {} + self.index2word = {SOS_ID: SOS_token, EOS_ID: EOS_token, UNK_ID: UNK} + self.n_words = len(self.word2index) # Count SOS and EOS + + def index_words(self, sentence): + for word in sentence.split(' '): + self.index_word(word) + + def index_word(self, word): + self.vocab.add(word) + + def compute_maps(self): + words = sorted(list(self.vocab)) + for word in words: + if word not in self.word2index: + self.word2index[word] = self.n_words + self.word2count[word] = 1 + self.index2word[self.n_words] = word + self.n_words += 1 + else: + self.word2count[word] += 1 diff --git a/seq2seq/main.py b/seq2seq/main.py new file mode 100644 index 0000000..5732fae --- /dev/null +++ b/seq2seq/main.py @@ -0,0 +1,151 @@ +import random +import logging +import sys + +import torch +import torch.nn as nn +import numpy as np + +from utils.arguments import PARSER +from readers.aligned_reader import load_aligned_data, read_examples +from seq2seq.constants import STEP +from seq2seq.evaluators.reporter import AccReporter, get_decoded_words +from seq2seq.lang import Lang +from seq2seq.runner import run +from seq2seq.trainers.monotonic_train import MonotonicTrainer +from seq2seq.model_utils import load_checkpoint, model_builder, setup_optimizers + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +logging.basicConfig(format=':%(levelname)s: %(message)s', level=logging.INFO) + + +def subsample_examples(examples, frac, single_token): + new_examples = [] + for ex in examples: + fr, en, weight, is_eng = ex + frtokens, entokens = fr.split(" "), en.split(" ") + if len(frtokens) != len(entokens): continue + if single_token: + if len(frtokens) > 1 or len(entokens) > 1: continue + for frtok, entok in zip(frtokens, entokens): + new_examples.append((frtok, entok, weight, is_eng)) + examples = new_examples + logging.info("new examples %d", len(examples)) + # subsample if needed + random.shuffle(examples) + if frac < 1.0: + tmp = examples[0:int(frac * len(examples))] + examples = tmp + elif frac > 1.0: + tmp = examples[0:int(frac)] + examples = tmp + return examples + + +def index_vocab(examples, fr_lang, en_lang): + for ex in examples: + raw_x, raw_y, xs, ys, weight, is_eng = ex + fr_lang.index_words(xs) + en_lang.index_words(ys) + logging.info("train size %d", len(examples)) + + +langcodes = {"hi": "hindi", "fa": "farsi", "ta": "tamil", "ba": "bengali", "ka": "kannada", "he": "hebrew", + "th": "thai"} + +if __name__ == '__main__': + args = PARSER.parse_args() + args = vars(args) + logging.info(args) + batch_first = args["batch_first"] + device_id = args["device_id"] + seed = args["seed"] + native_or_eng = args["nat_or_eng"] + 
single_token = args["single_token"] + + remove_spaces = True + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + lang = langcodes[args["lang"]] + + trainpath = "data/%s/%s_train_annotateEN" % (lang, lang) if args["ftrain"] is None else args["ftrain"] + testpath = "data/%s/%s_test_annotateEN" % (lang, lang) if args["ftest"] is None else args["ftest"] + + examples = read_examples(fpath=trainpath, + native_or_eng=native_or_eng, + remove_spaces=remove_spaces) + + examples = subsample_examples(examples=examples, frac=args["frac"], single_token=single_token) + + fr_lang, en_lang = Lang(name="fr"), Lang(name="en") + examples = load_aligned_data(examples=examples, + mode="mcmc", + seed=seed) + index_vocab(examples, fr_lang, en_lang) + en_lang.index_word(STEP) + fr_lang.compute_maps() + en_lang.compute_maps() + # see_phrase_alignments(examples=examples) + logging.info(fr_lang.word2index) + logging.info(en_lang.word2index) + # ALWAYS READ ALL TEST EXAMPLES + test = read_examples(fpath=testpath) + train = read_examples(fpath=trainpath) + + train = [ex for ex in train if ' ' not in ex[0] and ' ' not in ex[1]] + logging.info("input vocab: %d", fr_lang.n_words) + logging.info("output vocab: %d", en_lang.n_words) + logging.info("beam width: %d", args["beam_width"]) + + # Initialize models + encoder, decoder, evaler = model_builder(args, fr_lang=fr_lang, en_lang=en_lang) + enc_opt, dec_opt, enc_sch, dec_sch = setup_optimizers(args=args, encoder=encoder, decoder=decoder) + criterion = nn.NLLLoss() + + trainer = MonotonicTrainer(encoder=encoder, decoder=decoder, + enc_opt=enc_opt, dec_opt=dec_opt, + enc_sch=enc_sch, dec_sch=dec_sch, + fr_lang=fr_lang, en_lang=en_lang) + + # Begin! + test_reporter = AccReporter(args=args, + dump_file=args["dump"]) + train_reporter = AccReporter(args=args, + dump_file=args["dump"] + ".train.txt" if args["dump"] is not None else None) + + if args["restore"]: + if "," in args["restore"]: + logging.info("ensembling ...") + pass + else: + load_checkpoint(encoder=encoder, decoder=decoder, + enc_opt=enc_opt, dec_opt=dec_opt, + ckpt_path=args["restore"]) + if args["interactive"]: + try: + while True: + surface = input("enter surface:") + surface = " ".join(list(surface)) + print(surface) + x, y, weight, is_eng = surface, None, 1.0, False + decoded_outputs = evaler.infer_on_example(sentence=x) + scores_and_words = get_decoded_words(decoded_outputs) + decoded_words = [w for s, w in scores_and_words] + scores = [s for s, w in scores_and_words] + print(scores_and_words) + except KeyboardInterrupt: + print('interrupted!') + sys.exit(0) + else: + logging.info(20 * "-" + "TEST" + 20 * "-") + test_reporter.report_eval(epoch=-1, seen=-1, evaler=evaler, examples=test) + + else: + run(args=args, + examples=examples, + trainer=trainer, evaler=evaler, criterion=criterion, + train=train, test=test, + train_reporter=train_reporter, test_reporter=test_reporter) diff --git a/seq2seq/model_utils.py b/seq2seq/model_utils.py new file mode 100644 index 0000000..afd454a --- /dev/null +++ b/seq2seq/model_utils.py @@ -0,0 +1,108 @@ +import logging +import sys, shutil +import os +from seq2seq.monotonic_decoder import MonotonicDecoder +from seq2seq.inferences.monotonic_infer import MonotonicInference +from seq2seq.constants import STEP +from seq2seq.encoder import EncoderRNN +import torch +from torch import optim + +__author__ = 'Shyam' + + +def setup_optimizers(args, encoder, decoder): + learning_rate = args["lr"] + reduction_factor = 
args['reduction_factor'] + patience = args['patience'] + + enc_opt = optim.Adam(encoder.parameters(), lr=learning_rate) + dec_opt = optim.Adam(decoder.parameters(), lr=learning_rate) + enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(enc_opt, + factor=reduction_factor, + patience=patience, + verbose=True) + dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(dec_opt, + factor=reduction_factor, + patience=patience, + verbose=True) + return enc_opt, dec_opt, enc_scheduler, dec_scheduler + + +def model_builder(args, fr_lang, en_lang): + bidi = args["bidi"] + device_id = args["device_id"] + batch_first = args["batch_first"] + vector_size = args["wdim"] + hidden_size = args["hdim"] + beam_width = args["beam_width"] + norm_by_length = args["norm_by_length"] + if args["mono"]: + decoder_input_size = 2 * 2 * hidden_size if bidi else 2 * hidden_size + else: + decoder_input_size = vector_size + + decoder_hidden_size = 2 * hidden_size if bidi else hidden_size + # print("hidden_size", hidden_size) + # print("decoder_hidden_size", decoder_hidden_size) + dropout_p = args["wdrop"] + + if args["mono"]: + en_lang.index_word(STEP) + + invoc_size = len(fr_lang.word2index) # 20 + outvoc_size = len(en_lang.word2index) # 30 + + encoder = EncoderRNN(invoc_size=invoc_size, + vector_size=vector_size, + hidden_size=hidden_size, + bidi=bidi, + batch_first=batch_first) + + # if args["mono"]: + decoder = MonotonicDecoder(input_size=decoder_input_size, + batch_first=batch_first, + outvoc_size=outvoc_size, + hidden_size=decoder_hidden_size) + evaler = MonotonicInference(encoder=encoder, + decoder=decoder, + fr_lang=fr_lang, + en_lang=en_lang, + beam_width=beam_width, + norm_by_length=norm_by_length) + logging.info(encoder) + logging.info(decoder) + # Move models to GPU + if device_id is not None: + encoder.cuda(device_id) + decoder.cuda(device_id) + return encoder, decoder, evaler + + +def load_checkpoint(encoder, decoder, enc_opt, dec_opt, ckpt_path): + if os.path.isfile(ckpt_path): + logging.info("=> loading checkpoint %s", ckpt_path) + checkpoint = torch.load(ckpt_path) + encoder.load_state_dict(checkpoint['enc_state_dict']) + decoder.load_state_dict(checkpoint['dec_state_dict']) + if enc_opt is not None: + enc_opt.load_state_dict(checkpoint['enc_opt_state_dict']) + if dec_opt is not None: + dec_opt.load_state_dict(checkpoint['dec_opt_state_dict']) + logging.info("=> loaded checkpoint!") + return checkpoint + # any other relevant state variables can be extracted from the checkpoint dict + else: + logging.info("=> no checkpoint at %s !!!", ckpt_path) + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + """ + From https://discuss.pytorch.org/t/saving-and-loading-a-model-in-pytorch/2610/3 + + """ + logging.info("saving model to %s", filename) + torch.save(state, filename) + if is_best: + logging.info("copying to best ...") + shutil.copyfile(filename, filename + '_best.pth.tar') diff --git a/seq2seq/monotonic_decoder.py b/seq2seq/monotonic_decoder.py new file mode 100644 index 0000000..35e7819 --- /dev/null +++ b/seq2seq/monotonic_decoder.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V + + +class MonotonicDecoder(nn.Module): + def __init__(self, input_size, outvoc_size, hidden_size, n_layers=1, device_id=None, batch_first=True): + super().__init__() + self.hidden_size = hidden_size + self.device_id = device_id + self.batch_first = batch_first + self.n_layers = n_layers + # concatenated_input_dim = input_size + hidden_size + # 
print("concatenated_input_dim",concatenated_input_dim) + self.decoder_rnn = nn.GRU(input_size=input_size, + hidden_size=hidden_size, + batch_first=True) + self.char_lookup = nn.Embedding(outvoc_size, hidden_size) + self.out = nn.Linear(in_features=hidden_size, + out_features=outvoc_size) + + def forward(self, prev_word, idx, last_hidden, encoder_outputs): + # set prev_output_vec for first lstm step as BEGIN_WORD + if self.batch_first: + encoder_outputs = encoder_outputs.transpose(0, 1) + prev_word_vec = self.char_lookup(prev_word) + attended_vec = encoder_outputs[idx] + decoder_input = torch.cat((prev_word_vec, attended_vec), dim=1) + decoder_output, hidden = self.decoder_rnn(decoder_input.unsqueeze(0), last_hidden) + return decoder_output, hidden + + def init_hidden(self, batch_size=1): + k = self.n_layers * 1 + hidden = V(torch.zeros(k, batch_size, self.hidden_size)) + if self.device_id is not None: + hidden = hidden.cuda(self.device_id) + return hidden + + def _cuda(self, m): + if self.device_id is not None: + return m.cuda(self.device_id) + return m diff --git a/seq2seq/runner.py b/seq2seq/runner.py new file mode 100644 index 0000000..ec664fc --- /dev/null +++ b/seq2seq/runner.py @@ -0,0 +1,52 @@ +import random +import logging + +import numpy as np + +from seq2seq.model_utils import save_checkpoint + +__author__ = 'Shyam' + + +def run(args, examples, trainer, criterion, evaler, train, test, test_reporter, train_reporter): + n_epochs = args["iters"] + logging.info("training on %d examples for %d epochs", len(examples), n_epochs) + random.shuffle(examples) + seen = 0 + for epoch in range(1, n_epochs + 1): + epoch_losses = [] + random.shuffle(examples) + for example in examples: + # FOR MONOTONIC MODEL, x CANNOT have any alignment characters! + ex_loss = trainer.train_on_example(example=example, + criterion=criterion) + seen += 1 + # Keep track of loss + epoch_losses.append(ex_loss) + if seen > 0 and seen % args["evalfreq"] == 0: + logging.info("seen %d loss:%.3f", seen, np.average(epoch_losses[-50:])) + best_updated, test_acc = test_reporter.report_eval(epoch=epoch, seen=seen, evaler=evaler, examples=test) + if best_updated and args["save"]: + state_dict = { + 'args': args, + 'enc_state_dict': trainer.encoder.state_dict(), + 'dec_state_dict': trainer.decoder.state_dict(), + 'enc_opt_state_dict': trainer.enc_opt.state_dict(), + 'dec_opt_state_dict': trainer.dec_opt.state_dict(), + } + save_checkpoint(state=state_dict, is_best=True, filename=args["save"]) + if seen > 0 and seen % args["logfreq"] == 0: + logging.info("seen %d loss:%.3f", seen, np.average(epoch_losses[-50:])) + logging.info("epoch loss %.3f", np.average(epoch_losses)) + if args["save"]: + logging.info("saving final model ...") + state_dict = { + 'args': args, + 'enc_state_dict': trainer.encoder.state_dict(), + 'dec_state_dict': trainer.decoder.state_dict(), + 'enc_opt_state_dict': trainer.enc_opt.state_dict(), + 'dec_opt_state_dict': trainer.dec_opt.state_dict(), + } + save_checkpoint(state=state_dict, is_best=False, filename=args["save"]) + logging.info(20 * "-" + "TEST" + 20 * "-") + test_reporter.report_eval(epoch=n_epochs, seen=seen, evaler=evaler, examples=test) diff --git a/seq2seq/torch_utils.py b/seq2seq/torch_utils.py new file mode 100644 index 0000000..56febf4 --- /dev/null +++ b/seq2seq/torch_utils.py @@ -0,0 +1,33 @@ +import torch +from torch.autograd import Variable as V + +from seq2seq.constants import EOS_ID, EOS_token + + +def variables_from_pair(x, y, input_lang=None, output_lang=None): + input_variable = 
variable_from_sentence(input_lang, x) + target_variable = variable_from_sentence(output_lang, y) + return input_variable, target_variable + + +def variable_from_sentence(lang, sentence, device_id=None): + indexes = indexes_from_sentence(lang, sentence) + indexes.append(EOS_ID) + var = V(torch.LongTensor(indexes).view(-1, 1)) + # print('var =', var) + if device_id is not None: + var = var.cuda(device_id) + return var + + +def indexes_from_sentence(lang, sentence): + return [lang.word2index[word] for word in sentence.split(' ')] + + +def pad_batch(batch, pad_unit): + lengths = [len(i) for i in batch] + max_length = max(lengths) + for ex in batch: + padding = (max_length - len(ex)) * [pad_unit] + ex += padding + return batch, lengths diff --git a/seq2seq/trainers/monotonic_train.py b/seq2seq/trainers/monotonic_train.py new file mode 100644 index 0000000..1a553c9 --- /dev/null +++ b/seq2seq/trainers/monotonic_train.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V +from seq2seq.constants import SOS_token, SOS_ID +from seq2seq.constants import EOS_token +from seq2seq.constants import ALIGN_SYMBOL +from seq2seq.constants import STEP +from seq2seq.constants import UNK +from seq2seq.trainers.seq2seq_attn_trainer import Seq2SeqAttnTrainer + + +def make_target(word, word2idx): + return torch.LongTensor([word2idx[word]]) + + +class MonotonicTrainer(Seq2SeqAttnTrainer): + def __init__(self, encoder, decoder, enc_opt, dec_opt, enc_sch, dec_sch, fr_lang, en_lang, clip=0.5, teacher_forcing_ratio=0.5, + device_id=None): + self.encoder = encoder + self.decoder = decoder + self.enc_opt = enc_opt + self.dec_opt = dec_opt + self.enc_sch = enc_sch + self.dec_sch = dec_sch + self.fr_lang = fr_lang + self.en_lang = en_lang + self.clip = clip + self.device_id = device_id + + def prepare_example(self, example): + raw_x, raw_y, x, y, weight, is_eng = example + raw_x, raw_y, x, y = raw_x.split(" "), raw_y.split(" "), x.split(" "), y.split(" ") + example = raw_x, raw_y, x, y, weight + return example + + def compute_loss(self, example, criterion, profile): + raw_x, raw_y, aligned_x, aligned_y, weight = example + # i is input index, j is output index + i = 0 + j = 0 + padded_raw_x = [SOS_token] + raw_x + [EOS_token] + hidden = self.encoder.init_hidden() + padded_lemma_idx = [self.fr_lang.word2index[w] for w in padded_raw_x] + input_word = V(torch.LongTensor(padded_lemma_idx)) + encoder_outputs, encoder_hidden_state = self.encoder(input_word, hidden) + aligned_x += [EOS_token] + aligned_y += [EOS_token] + + # start decoding, keeping track of sequence loss + decoder_hidden = self.decoder.init_hidden() + prev_word = V(torch.LongTensor([SOS_ID])) + loss = [] # V(torch.FloatTensor([0.0])) + + for a, (input_char, output_char) in enumerate(zip(aligned_x, aligned_y)): + possible_outputs = [] + if output_char == EOS_token: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(make_target(word=EOS_token, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + continue + + if padded_raw_x[i] == SOS_token and aligned_x[a] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(make_target(word=STEP, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + + prev_word 
= V(make_target(word=STEP, word2idx=self.en_lang.word2index)) + i += 1 + + if aligned_y[a] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + + if aligned_y[a] in self.en_lang.word2index: + target = V(make_target(word=aligned_y[a], word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + prev_word = V(make_target(word=aligned_y[a], word2idx=self.en_lang.word2index)) + else: + target = V(make_target(word=UNK, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + prev_word = V(make_target(word=UNK, word2idx=self.en_lang.word2index)) + + loss.append(ex_loss) + + j += 1 + + if i < len(padded_raw_x) - 1 and aligned_x[a + 1] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(torch.LongTensor([self.en_lang.word2index[STEP]])) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + prev_word = V(torch.LongTensor([self.en_lang.word2index[STEP]])) + # whenever you step, attend to next position + i += 1 + return weight*sum(loss)/len(loss) + + def step_decoder(self, prev_word, i, decoder_hidden, encoder_outputs): + decoder_output, decoder_hidden = self.decoder(prev_word, + i, + decoder_hidden, + encoder_outputs) + # compute local loss + scores = self.decoder.out(decoder_output) + log_softmax = nn.LogSoftmax(dim=2) + scores = log_softmax(scores) + scores = scores.squeeze(1) + return decoder_hidden, scores diff --git a/seq2seq/trainers/seq2seq_attn_trainer.py b/seq2seq/trainers/seq2seq_attn_trainer.py new file mode 100644 index 0000000..33c852c --- /dev/null +++ b/seq2seq/trainers/seq2seq_attn_trainer.py @@ -0,0 +1,195 @@ +import time +import logging + +import torch +from torch.autograd import Variable as V +import torch.nn.functional as F + +from seq2seq.constants import EOS_token, SOS_ID +from seq2seq.torch_utils import variables_from_pair + + +class Seq2SeqAttnTrainer: + def __init__(self, encoder, decoder, fr_lang, en_lang, enc_opt, dec_opt, clip=0.5, teacher_forcing_ratio=0.5, + device_id=None): + self.clip = clip + self.teacher_forcing_ratio = teacher_forcing_ratio + self.device_id = device_id + self.teacher_forcing_ratio = teacher_forcing_ratio + self.encoder = encoder + self.decoder = decoder + self.enc_opt = enc_opt + self.dec_opt = dec_opt + self.fr_lang = fr_lang + self.en_lang = en_lang + + def train_on_example(self, example, + criterion, profile=False): + + # y_length = len(y) + prep_ex = self.prepare_example(example) + + # Zero gradients of both optimizers + self.enc_opt.zero_grad() + self.dec_opt.zero_grad() + + loss = self.compute_loss(prep_ex, criterion, profile) + + # Backpropagation + loss.backward() + torch.nn.utils.clip_grad_norm(self.encoder.parameters(), self.clip) + torch.nn.utils.clip_grad_norm(self.decoder.parameters(), self.clip) + self.enc_opt.step() + self.dec_opt.step() + + return loss.data[0] + + def compute_loss(self, prep_ex, criterion, profile): + loss = [] # Added onto for each word + + x, y = prep_ex + # Get size of input and target sentences + x_length = x.size()[0] + y_length = y.size()[0] + + # Run words through encoder + tic = time.time() + encoder_hidden = self.encoder.init_hidden() + encoder_outputs, encoder_hidden = self.encoder(word_inputs=x, + hidden=encoder_hidden, + ) + toc = time.time() + if profile: logging.info("encoding time %.2f", toc - tic) + # 
print("encoder_outputs",encoder_outputs.size(),encoder_hidden.size()) + # return + # Prepare input and output variables + tic = time.time() + decoder_input = V(torch.LongTensor([[SOS_ID]])) + # Use last hidden state from encoder to start decoder + decoder_hidden = self.decoder.init_hidden(encoder_hidden) + # print("decoder_hidden",decoder_hidden.size()) + if self.device_id: + decoder_input = decoder_input.cuda(self.device_id) + + # Choose whether to use teacher forcing + use_teacher_forcing = True # random.random() < self.teacher_forcing_ratio + if use_teacher_forcing: + + # Teacher forcing: Use the ground-truth target as the next input + for di in range(y_length): + # print("decoder_input",decoder_input.size()) + decoder_output, decoder_hidden = self.decoder(decoder_input, + encoder_outputs, + decoder_hidden) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + # decoder_input = y[di] # Next target is next input + decoder_input = y[di].unsqueeze(0) # Next target is next input + + else: + # Without teacher forcing: use network's own prediction as the next input + for di in range(y_length): + decoder_output, decoder_hidden = self.decoder(decoder_input, + encoder_outputs, + decoder_hidden) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + + # Get most likely word index (highest value) from output + topv, topi = decoder_output.data.topk(1) + ni = topi[0][0] + + decoder_input = V(torch.LongTensor([[ni]])) # Chosen word is next input + if self.device_id: + decoder_input = decoder_input.cuda() + + # Stop at end of sentence (not necessary when using known targets) + if ni == EOS_token: + break + + toc = time.time() + if profile: logging.info("decoding time %.2f", toc - tic) + return sum(loss) / len(loss) + + def compute_loss_old(self, prep_ex, criterion, profile): + loss = [] # Added onto for each word + + x, y = prep_ex + # Get size of input and target sentences + x_length = x.size()[0] + y_length = y.size()[0] + + # Run words through encoder + tic = time.time() + encoder_hidden = self.encoder.init_hidden() + encoder_outputs, encoder_hidden = self.encoder(word_inputs=x, + hidden=encoder_hidden, + ) + toc = time.time() + if profile: logging.info("encoding time %.2f", toc - tic) + # print("encoder_outputs",encoder_outputs.size(),encoder_hidden.size()) + # return + # Prepare input and output variables + tic = time.time() + decoder_input = V(torch.LongTensor([[SOS_ID]])) + decoder_context = V(torch.zeros(1, self.decoder.hidden_size)) + decoder_hidden = torch.cat([encoder_hidden[0, :, :], encoder_hidden[1, :, :]], dim=-1).unsqueeze( + 0) # Use last hidden state from encoder to start decoder + # decoder_hidden = self.decoder.init_hidden() # Use last hidden state from encoder to start decoder + # print("decoder_hidden",decoder_hidden.size()) + if self.device_id: + decoder_input = decoder_input.cuda(self.device_id) + decoder_context = decoder_context.cuda(self.device_id) + + # Choose whether to use teacher forcing + use_teacher_forcing = True # random.random() < self.teacher_forcing_ratio + if use_teacher_forcing: + + # Teacher forcing: Use the ground-truth target as the next input + for di in range(y_length): + decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder(decoder_input, + decoder_context, + decoder_hidden, + encoder_outputs) + # 
print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + decoder_input = y[di] # Next target is next input + + else: + # Without teacher forcing: use network's own prediction as the next input + for di in range(y_length): + decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder(decoder_input, + decoder_context, + decoder_hidden, + encoder_outputs) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + + # Get most likely word index (highest value) from output + topv, topi = decoder_output.data.topk(1) + ni = topi[0][0] + + decoder_input = V(torch.LongTensor([[ni]])) # Chosen word is next input + if self.device_id: + decoder_input = decoder_input.cuda() + + # Stop at end of sentence (not necessary when using known targets) + if ni == EOS_token: + break + + toc = time.time() + if profile: logging.info("decoding time %.2f", toc - tic) + return sum(loss) / len(loss) + + def prepare_example(self, example): + raw_x, raw_y, x, y, weight, is_eng = example + training_pair = variables_from_pair(x, y, + input_lang=self.fr_lang, + output_lang=self.en_lang) + vx = training_pair[0] + vy = training_pair[1] + return vx, vy, weight diff --git a/train_model.sh b/train_model.sh new file mode 100755 index 0000000..2efb1aa --- /dev/null +++ b/train_model.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 3 ]; then # number of args + echo "USAGE: " + echo "$ME" + exit +fi +lang=$1 +seed=$2 +model=$3 +time python -m seq2seq.main \ + --lang ${lang} \ + --mono \ + --beam_width 1 \ + --save ${model} \ + --seed ${seed} + + + + + +if [[ $? == 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/train_model_on_files.sh b/train_model_on_files.sh new file mode 100755 index 0000000..98a2f8d --- /dev/null +++ b/train_model_on_files.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 4 ]; then # number of args + echo "USAGE: ${ME} " + exit +fi +ftrain=$1 +ftest=$2 +seed=$3 +model=$4 + +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --ftest ${ftest} \ + --mono \ + --beam_width 1 \ + --save ${model} \ + --seed ${seed} + + + + + +if [[ $? 
== 0 ]] # success
+then
+ : # do nothing
+else # something went wrong
+ echo "SOME PROBLEM OCCURRED"; # echo file with problems
+fi
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/arguments.py b/utils/arguments.py
new file mode 100644
index 0000000..088cb8c
--- /dev/null
+++ b/utils/arguments.py
@@ -0,0 +1,44 @@
+import argparse
+
+PARSER = argparse.ArgumentParser(description='transliteration model')
+PARSER.add_argument('--iters', type=int, default=20, help='# train iters (default: 20)')
+PARSER.add_argument('--maxsteps', type=int, default=500000, help='max training steps (default: 500000)')
+PARSER.add_argument('--batch_size', type=int, default=1, help='batch size (default: 1)')
+PARSER.add_argument('--seed', type=int, default=42, metavar='N', help='random seed (default: 42)')
+PARSER.add_argument('--restore', type=str, default=None, help='path from which to restore model')
+PARSER.add_argument('--profile', action='store_true', help='log encoder/decoder timing')
+PARSER.add_argument('--save', type=str, default=None, help='save model')
+PARSER.add_argument('--lang', type=str, default="hi", help='language')
+PARSER.add_argument('--wdim', type=int, default=50, help='word vec size')
+PARSER.add_argument('--hdim', type=int, default=20, help='rnn hidden size')
+PARSER.add_argument('--cell', type=str, default="gru", help='rnn type')
+PARSER.add_argument('--wdrop', type=float, default=0.0, help='word dropout')
+PARSER.add_argument('--lr', type=float, default=0.001, help='learning rate')
+PARSER.add_argument('--clip', type=float, default=None, help='grad clipping')
+PARSER.add_argument('--optimizer', type=str, default="adam", help='optimizer')
+PARSER.add_argument('--extra', type=str, default=None, help='extra mined data')
+PARSER.add_argument('--nat_or_eng', type=str, default="both", help='nat|eng|both')
+PARSER.add_argument('--evalfreq', type=int, default=500, help='evaluate every N examples seen (default: 500)')
+PARSER.add_argument('--logfreq', type=int, default=100, help='log training loss every N examples seen (default: 100)')
+PARSER.add_argument('--patience', type=int, default=10, help='LR scheduler patience (default: 10)')
+PARSER.add_argument('--reduction_factor', type=float, default=0.1, help='reduction factor for LR schedule')
+PARSER.add_argument('--beam_width', type=int, default=1, help='beam width (default: 1)')
+PARSER.add_argument('--norm_by_length', dest='norm_by_length', action='store_true', help='norm score of final beam contents by length')
+PARSER.add_argument('--single_token', dest='single_token', action='store_true', help='keep only examples aligned as single tokens')
+PARSER.set_defaults(single_token=True)
+
+PARSER.add_argument('--max_output_length', type=int, default=75, help='(default: 75)')
+PARSER.add_argument('--ftrain', type=str, help='train file')
+PARSER.add_argument('--ftest', type=str, help='test/val file')
+PARSER.add_argument('--frac', type=float, default=1.0, help='frac of train data')
+PARSER.add_argument('--dump', type=str, default=None, help='to dump test predictions')
+PARSER.add_argument('--device_id', type=int, default=None, help='gpu device')
+PARSER.add_argument('--ncands', type=int, default=20, help='number of candidates')
+PARSER.add_argument('--no-bidi', dest='bidi', action='store_false', help='do not use bidirectional')
+PARSER.set_defaults(bidi=True)
+PARSER.add_argument('--no-batch-first', dest='batch_first', action='store_false', help='do not use batch first')
+PARSER.set_defaults(batch_first=True)
+PARSER.add_argument('--mono', dest='mono', action='store_true', help='use monotonic transliteration model')
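+# Usage sketch (the flag values below are placeholders, not recommended settings):
+# seq2seq/main.py consumes this parser as a plain dict via vars(PARSER.parse_args()).
+_example_args = vars(PARSER.parse_args(['--ftrain', 'train.txt', '--ftest', 'dev.txt',
+                                        '--mono', '--beam_width', '1', '--seed', '100']))
+# _example_args['mono'] is True, _example_args['beam_width'] == 1, _example_args['seed'] == 100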
+PARSER.add_argument('--interactive', action="store_true", dest="interactive") +PARSER.add_argument('--outfile', action="store", dest="outfile") + diff --git a/utils/news_evaluation_script/news_evaluation.py b/utils/news_evaluation_script/news_evaluation.py new file mode 100755 index 0000000..f243a8f --- /dev/null +++ b/utils/news_evaluation_script/news_evaluation.py @@ -0,0 +1,510 @@ +#!/usr/bin/python + +import codecs +import logging +import sys +import getopt +from os.path import basename +import xml.dom.minidom +from xml.dom.minidom import Node +# what we expect to find inside tag... +import editdistance +import numpy as np +RESULT_HEADER_ATTR = ('SourceLang', 'TargetLang', 'GroupID', 'RunID', 'RunType', 'Comments') +# ... and inside tag +CORPUS_HEADER_ATTR = ('SourceLang', 'TargetLang', 'CorpusID', 'CorpusType', 'CorpusSize', 'CorpusFormat') + +MAX_CANDIDATES = 10 + + +def usage(): + ''' + User's manual + ''' + print(''' +Transliteration results evaluation script for NEWS: +Named Entities Workshop - Shared Task on Transliteration + +Usage: + [python] %s [-h|--help] [-i|--input-file=] + [-o|--output-file=] + -t|--test-file= + --max-candidates= + [--map-n=] + +Options: + -h, --help : Print this help and quit + + --check-only : Only checks that the file is in correct format. + When this option is given, only one file is + accepted, either stdin or given with -i option. + + -i, --input-file : Input file with transliteration results in NEWS + XML format. If not given, standard input is used. + + -t, --test-file : Test file with transliteration references in NEWS + XML format. + + -o, --output-file : Output file with contribution of each source word + to each metric. If not given, no details are written. + The output file contains comma-separated values + and can be opened by a spreadsheet application + such as Microsoft Excel or OpenOffice Calc. + The values in the file are not divided by the + number of source names. + + --max-candidates : Maximum number of transliteration candidates + to consider. By default, maximum 10 candidates are + considered for evaluation according to the + NEWS 2009 whitepaper. + + +The input files must be in UTF-8. + +Example: + %s -i translit_results.xml -t test.xml -o evaluation_details.csv + +The detailed description of the metrics is in the NEWS 2010 whitepaper. + +For comments, suggestions and bug reports email to Vladimir Pervouchine +vpervouchine@i2r.a-star.edu.sg. 
+ ''' % (basename(sys.argv[0]), basename(sys.argv[0]))) + + +def get_options(): + ''' + Extracts command line arguments + ''' + input_fname = None + output_fname = None + test_fname = None + max_candidates = MAX_CANDIDATES + check_only = False + silent = False + + try: + opts, args = getopt.gnu_getopt(sys.argv[1:], 'hi:o:t:', + ['help', 'input-file=', 'output-file=', 'test-file=', + 'check-only', 'silent']) + except getopt.GetoptError as err: + sys.stderr.write('Error: %s\n' % err) + usage() + sys.exit(1) + + for o, a in opts: + if o in ('-i', '--input-file'): + input_fname = a + elif o in ('-o', '--output-file'): + output_fname = a + elif o in ('-t', '--test-file'): + test_fname = a + elif o in ('-h', '--help'): + usage() + sys.exit() + elif o in ('--check-only',): + check_only = True + elif o in ('--silent',): + silent = True + elif o in ('--max-candidates',): + try: + max_candidates = int(a) + except ValueError: + sys.stderr.write('Error: --max-candidates takes integer argument (you provided %s).\n' % a) + sys.exit(1) + if max_candidates < 1: + sys.stderr.write('Error: --max-candidates must be above 0.\n') + sys.exit(1) + + else: + sys.stderr.write('Error: unknown option %s. Type --help to see the options.\n' % o) + sys.exit(1) + + if check_only: + if test_fname or output_fname: + sys.stderr.write('No test file or output file is required to check the input format.\n') + sys.exit(1) + else: + if not test_fname: + sys.stderr.write('Error: no test file provided.\n') + sys.exit(1) + + return input_fname, output_fname, test_fname, max_candidates, check_only, silent + + +def parse_xml(f_in, max_targets=None): + ''' + Parses XML input and test files with paranoid error checking. + Returns a tuple of header and content + Content is a dictionary with source names as keys and contains lists of target names. + If max_targets is given, the number of target names in the list is cut up to max_targets names. + Header is a dictionary of header data + ''' + + stderr = codecs.getwriter('utf-8')(sys.stderr) + + doc = xml.dom.minidom.parse(f_in) + if doc.encoding.lower() != 'utf-8': + raise IOError('Invalid encoding. UTF-8 is required but %s found' % doc.encoding) + + # try results + header = doc.getElementsByTagName('TransliterationTaskResults') + is_results = True + if not header: + # try corpus + is_results = False + header = doc.getElementsByTagName('TransliterationCorpus') + if not header: + raise IOError('Unknown file. TransliterationTaskResults and TransliterationCorpus tags are missing') + if len(header) > 1: + raise IOError('Invalid file. 
Several headers were found') + header = header[0] + + # parse the comments + header_data = {} + if is_results: + attr_list = RESULT_HEADER_ATTR + else: + attr_list = CORPUS_HEADER_ATTR + + for attr in attr_list: + header_data[attr] = header.getAttribute(attr) + + # parse the data + data = {} + for node in doc.getElementsByTagName('Name'): + # we ignore the name ID unless encounter error + # get the source name + s = node.getElementsByTagName('SourceName') + # import ipdb + # ipdb.set_trace() + if not s: + raise IOError('Invalid file format: one of nodes does not have ') + if s[0].childNodes[0].nodeType == Node.TEXT_NODE: + src_name = s[0].childNodes[0].data.strip('" ') # strip quotes and spaces in case someone adds them + src_name = src_name.upper() # convert to uppercase in case it's a language where case matters + else: + raise IOError('For Name ID %s no SourceName was found or its format is invalid' % node.getAttribute('ID')) + + # get the targets + t = node.getElementsByTagName('TargetName') + if not t: + raise IOError('Invalid file format: one of nodes does not have ') + # we'll read target names as tuples: (target_name, ID) so that the list can later be sorted + # according to the ID, which is going to be removed after that. + tgt_list = [] + for t_node in t: + # get the ID, which is the rank for transliteration candidates + try: + tgt_id = int(t_node.getAttribute('ID')) + except ValueError: + raise IOError( + 'For name ID %s (%s) one of target names have invalid ID' % (node.getAttribute('ID'), src_name)) + # get the word + if not t_node.childNodes: + raise IOError('For name ID %s (%s) one of the target names ID %s is empty' % ( + node.getAttribute('ID'), src_name, tgt_id)) + if t_node.childNodes[0].nodeType == Node.TEXT_NODE: + tgt_name = t_node.childNodes[0].data.strip('" ') + if tgt_name: + tgt_name = tgt_name.upper() # convert to uppercase in case it matters + tgt_list.append((tgt_name, tgt_id)) + else: + stderr.write( + 'Warning: Name ID %s (%s) contains empty target words\n' % (node.getAttribute('ID'), src_name)) + else: + raise IOError('For name ID %s (%s) one of target names ID %s have invalid format' % ( + node.getAttribute('ID'), src_name, tgt_id)) + + # sort by ID + if not tgt_list: + stderr.write('Warning: no non-empty target words found for name ID %s (%s). This name is ignored.\n' % ( + node.getAttribute('ID'), src_name)) + + else: + + tgt_list.sort(key=lambda x: x[1]) + # check for duplicate IDs: if there are any, they must be adjacent elements after sorting + # we only care for IDs to be unique in the results file because IDs are ranks there. 
+ if is_results: + for i in range(len(tgt_list) - 1): + if tgt_list[i][1] == tgt_list[i + 1][1]: + raise IOError( + 'XML results file contains duplicate IDs for transliterations of word %s' % src_name) + + # cut up to max_targets + if max_targets: + tgt_list = tgt_list[0:max_targets] + + data[src_name] = [tgt[0] for tgt in tgt_list] # remove IDs, we don't need them anymore + + # test (codecs.getwriter('utf-8')(sys.stdout)).write('Name: %s\n' % (data[src_name][0])) + # test raise IOError('%s' % data[src_name][0]) + + return header_data, data, is_results + + +def LCS_length(s1, s2): + ''' + Calculates the length of the longest common subsequence of s1 and s2 + s1 and s2 must be anything iterable + The implementation is almost copy-pasted from Wikibooks.org + ''' + m = len(s1) + n = len(s2) + # An (m+1) times (n+1) matrix + C = [[0] * (n + 1) for i in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + if s1[i - 1] == s2[j - 1]: + C[i][j] = C[i - 1][j - 1] + 1 + else: + C[i][j] = max(C[i][j - 1], C[i - 1][j]) + return C[m][n] + + +def levenshtein(s1, s2): + if len(s1) < len(s2): + return levenshtein(s2, s1) + + # len(s1) >= len(s2) + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[ + j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer + deletions = current_row[j] + 1 # than s2 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] + + +def compute_edit_dist(ref, candidate): + ref = ref.replace(" ", "") + candidate = candidate.replace(" ", "") + return editdistance.eval(ref, candidate) + + +def f_score(candidate, references): + ''' + Calculates F-score for the candidate and its best matching reference + Returns F-score and best matching reference + ''' + # determine the best matching reference (the one with the shortest ED) + best_ref = references[0] + if len(candidate) == 0: + return 0.0, best_ref, 100, 0 + best_ref_lcs = LCS_length(candidate, references[0]) + for ref in references[1:]: + lcs = LCS_length(candidate, ref) + if (len(ref) - 2 * lcs) < (len(best_ref) - 2 * best_ref_lcs): + best_ref = ref + best_ref_lcs = lcs + + # try: + precision = float(best_ref_lcs) / float(len(candidate)) + recall = float(best_ref_lcs) / float(len(best_ref)) + # except: + # import ipdb + # ipdb.set_trace() + # edit_dist = levenshtein(best_ref,candidate) + # edit_dist = Levenshtein.distance(best_ref,candidate) + edit_dist = compute_edit_dist(ref=best_ref, candidate=candidate) + nrm_edit_dist = edit_dist / len(best_ref) + # print("best_ref:", best_ref, "candidate:", candidate, "edit_dist:", edit_dist) + if best_ref_lcs: + return 2 * precision * recall / (precision + recall), best_ref, edit_dist, nrm_edit_dist + else: + return 0.0, best_ref, edit_dist, nrm_edit_dist + + +def mean_average_precision(candidates, references, n): + ''' + Calculates mean average precision up to n candidates. + ''' + + total = 0.0 + num_correct = 0 + for k in range(n): + if k < len(candidates) and (candidates[k] in references): + num_correct += 1 + total += float(num_correct) / float(k + 1) + + return total / float(n) + + +def inverse_rank(candidates, reference): + ''' + Returns inverse rank of the matching candidate given the reference + Returns 0 if no match was found. 
+ ''' + rank = 0 + while (rank < len(candidates)) and (candidates[rank] != reference): + rank += 1 + if rank == len(candidates): + return 0.0 + else: + return 1.0 / (rank + 1) + + +def evaluate(pred_dict, gold_dict): + ''' + REMEMBER! -- pred_dict and gold_dict should be word to lists dictionaries. + The list will be ordered in descending order in the pred_dict. + If you only have a single reference make sure its in a list. + + Evaluates all metrics to save looping over input_data several times + n is the map-n parameter + Returns acc, f_score, mrr, map_ref, map_n + ''' + mrr = {} + acc = {} + f = {} + f_best_match = {} + # map_n = {} + map_ref = {} + # map_sys = {} + acc_10 = {} + edit_dist = {} + nrm_edit_dist = {} + for src_word in gold_dict.keys(): + if src_word in pred_dict: + candidates = pred_dict[src_word] + references = gold_dict[src_word] + + acc[src_word] = max([int(candidates[0] == ref) for ref in references]) # either 1 or 0 + + f[src_word], f_best_match[src_word], edit_dist[src_word], nrm_edit_dist[src_word] = f_score(candidates[0], references) + + mrr[src_word] = max([inverse_rank(candidates, ref) for ref in references]) + + # map_n[src_word] = mean_average_precision(candidates, references, n) + map_ref[src_word] = mean_average_precision(candidates, references, len(references)) + # map_sys[src_word] = mean_average_precision(candidates, references, len(candidates)) + + ## compute accuracy at 10- Anoop + acc_10[src_word] = max([int(ref in candidates) for ref in references]) # either 1 or 0 + + else: + logging.error('Warning: No transliterations found for word %s\n' % src_word) + mrr[src_word] = 0.0 + acc[src_word] = 0.0 + f[src_word] = 0.0 + edit_dist[src_word] = np.infty + nrm_edit_dist[src_word] = 1.0 + f_best_match[src_word] = '' + # map_n[src_word] = 0.0 + map_ref[src_word] = 0.0 + # map_sys[src_word] = 0.0 + # Anoop + acc_10[src_word] = 0.0 + + return acc, f, f_best_match, mrr, map_ref, acc_10, edit_dist, nrm_edit_dist # added by Anoop + + +def write_details(output_fname, input_data, test_data, acc, f, f_best_match, mrr, map_ref, acc_10): + ''' + Writes detailed results to CSV file + ''' + if output_fname == '-': + f_out = codecs.getwriter('utf-8')(sys.stdout) + else: + f_out = codecs.open(output_fname, 'w', 'utf-8') + + f_out.write('%s\n' % ( + ','.join(['"Source word"', '"First candidate"', '"ACC"', '"ACC-10"', '"F-score"', '"Best matching reference"', + '"MRR"', '"MAP_ref"', '"References"']))) + + for src_word in test_data.keys(): + if src_word in input_data: + first_candidate = input_data[src_word][0] + else: + first_candidate = '' + + f_out.write('%s,%s,%f,%f,%f,%s,%f,%f,%s\n' % ( + src_word, first_candidate, acc[src_word], acc_10[src_word], f[src_word], f_best_match[src_word], + mrr[src_word], + map_ref[src_word], '"' + ' | '.join(test_data[src_word]) + '"')) + + if output_fname != '-': + f_out.close() + + +def main(): + input_fname, output_fname, test_fname, max_candidates, check_only, silent = get_options() + stderr = codecs.getwriter('utf-8')(sys.stderr) + + if not input_fname: + f = sys.stdin + else: + f = input_fname + try: + input_header, input_data, is_results = parse_xml(f, max_targets=max_candidates) + except IOError as e: + error_message = e.strerror + if not error_message: + error_message = e.message + stderr.write(u'Error encountered while parsing input: %s.\n' % error_message) + sys.exit(1) + + if check_only: + stdout = codecs.getwriter('utf-8')(sys.stdout) + + if not silent: + if is_results: + corpus_type = 'testing or reference' + else: + corpus_type 
= 'training or development' + stdout.write('This is %s corpus\n' % corpus_type) + for elem in input_header.keys(): + stdout.write('%30s : %-30s\n' % (elem, input_header[elem])) + stdout.write('Number of words: %d\n' % len(input_data)) + else: + stdout.write("OK\n") + + sys.exit() + + try: + test_header, test_data, is_results = parse_xml(test_fname) + except IOError as e: + error_message = e.strerror + if not error_message: + error_message = e.message + stderr.write(u'Error encountered while parsing test file. Here is what the parser said:\n%s.\n' % error_message) + sys.exit(1) + + acc, f, f_best_match, mrr, map_ref, acc_10 = evaluate(input_data, test_data) + + if output_fname: + write_details(output_fname, input_data, test_data, acc, f, f_best_match, mrr, map_ref, acc_10) + + N = len(acc) + acc_num = float(sum([acc[src_word] for src_word in acc.keys()])) + acc10_num = float(sum([acc_10[src_word] for src_word in acc_10.keys()])) + sys.stdout.write('ACC: %f (%d/%d)\n' % (acc_num / N, acc_num, N)) + sys.stdout.write('Mean F-score: %f\n' % (float(sum([f[src_word] for src_word in f.keys()])) / N)) + sys.stdout.write('MRR: %f\n' % (float(sum([mrr[src_word] for src_word in mrr.keys()])) / N)) + sys.stdout.write('MAP_ref: %f\n' % (float(sum([map_ref[src_word] for src_word in map_ref.keys()])) / N)) + sys.stdout.write('ACC@10: %f (%d/%d)\n' % (acc10_num / N, acc10_num, N)) + # sys.stdout.write('MAP_%d: %f\n' % (n, float(sum([map_n[src_word] for src_word in map_n.keys()]))/N)) + # sys.stdout.write('MAP_sys: %f\n' % (float(sum([map_sys[src_word] for src_word in map_sys.keys()]))/N)) + + +def test(): + stdout = codecs.getwriter('utf-8')(sys.stdout) + input_header, input_data, is_result = parse_xml('news_results.xml', max_targets=10) + test_header, test_data, is_result = parse_xml('news_test.xml') + acc, f, f_best_match, mrr, map_ref = evaluate(input_data, test_data) + for src_word in test_data.keys(): + stdout.write('%10s ACC=%f\tF-score=%f (%s)\tMRR=%f\tMAP_ref=%f\n' % ( + src_word, acc[src_word], f[src_word], f_best_match[src_word], mrr[src_word], map_ref[src_word])) + + +if __name__ == '__main__': + main() + # test() diff --git a/utils/news_evaluation_script/news_results.xml b/utils/news_evaluation_script/news_results.xml new file mode 100644 index 0000000..55f96b7 --- /dev/null +++ b/utils/news_evaluation_script/news_results.xml @@ -0,0 +1,108 @@ + + + + + + + + ABARBANEL + 阿巴巴纳尔 + 阿巴巴内尔 + 阿巴班尔 + 阿巴本尔 + 阿巴巴尼尔 + 阿巴班埃尔 + 阿巴班尔 + 阿巴尔班埃尔 + 阿巴巴奈 + 阿巴巴纳 + 阿巴尔班尔 + 埃巴巴纳尔 + 阿巴尔巴纳尔 + 阿巴巴恩尔 + 阿巴贝内尔 + 埃巴巴内尔 + 阿巴尔巴内尔 + 阿巴本埃尔 + 阿巴班克尔 + 阿巴贝恩尔 + + + + ABBELL + 阿贝尔 + 阿贝尔尔 + 阿布贝尔 + 阿卜贝尔 + 埃贝尔 + 阿布尔 + 阿比尔 + 艾贝尔 + 艾布贝尔 + 阿比尔 + 阿贝利 + 阿比埃尔 + 奥贝尔 + 阿贝莱 + 亚贝尔 + 阿比厄尔 + 阿比雷尔 + 亚伯贝尔 + 阿布比尔 + 埃布贝尔 + + + ABBOT + 阿伯特 + 阿比特 + 阿布伯特 + 阿比奥特 + 埃伯特 + 阿卜伯特 + 阿布特 + 艾伯特 + 阿布布特 + 阿布奥特 + 阿布鲍特 + 奥伯特 + 阿比厄特 + 亚伯特 + 阿伯得 + 阿伯克 + 艾布伯特 + 阿卜布特 + 阿伯托 + 阿比欧 + + + ABELE + 阿贝尔 + 阿伯利 + 阿布尔 + 阿伯尔 + 阿比尔 + 阿贝利 + 阿布尔 + 阿贝尔尔 + 埃布尔 + 阿贝勒 + 阿布斯 + 阿布尔尔 + 阿布利 + 阿贝尔斯 + 埃贝尔 + 阿比利 + 阿拜尔 + 埃布尔尔 + 埃比尔 + 阿布勒 + + + + diff --git a/utils/news_evaluation_script/news_test.xml b/utils/news_evaluation_script/news_test.xml new file mode 100644 index 0000000..2b1034c --- /dev/null +++ b/utils/news_evaluation_script/news_test.xml @@ -0,0 +1,36 @@ + + + + + + + + ABARBANEL + + 阿巴巴纳尔 + + + + + ABBELL + 阿贝尔尔 + + + + ABBOT + 阿布伯特 + 埃伯特 + + + ABELE + + 阿贝尔埃伯特 + + +
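
As a quick sanity check of the metrics computed by `utils/news_evaluation_script/news_evaluation.py` (ACC, LCS-based mean F-score, MRR, MAP_ref, ACC@10), `evaluate()` can be called directly on toy dictionaries instead of going through the NEWS XML files. The snippet below is an illustrative sketch, not part of the patch: it assumes the script is importable as `news_evaluation` (e.g. when run from inside `utils/news_evaluation_script/`), that the `editdistance` and `numpy` packages are installed, and the prediction/reference data are invented for illustration.

```python
# Illustrative sketch only (not part of the patch). Assumes news_evaluation.py
# is importable as `news_evaluation` (e.g. run this from within
# utils/news_evaluation_script/) and that `editdistance` and `numpy` are installed.
import news_evaluation as ne

# pred_dict: source word -> ranked list of candidate transliterations
# gold_dict: source word -> list of acceptable reference transliterations
pred_dict = {'OBAMA': ['obama', 'abama'], 'ABBOT': ['abot']}
gold_dict = {'OBAMA': ['obama'], 'ABBOT': ['abbot', 'abbott']}

acc, f, f_best, mrr, map_ref, acc_10, ed, ned = ne.evaluate(pred_dict, gold_dict)

n = len(gold_dict)
print('ACC: %f' % (float(sum(acc.values())) / n))         # top-1 exact match
print('Mean F-score: %f' % (float(sum(f.values())) / n))  # LCS-based F-score
print('MRR: %f' % (float(sum(mrr.values())) / n))         # mean reciprocal rank
print('ACC@10: %f' % (float(sum(acc_10.values())) / n))   # match anywhere in the candidate list
```

Running the script itself, rather than importing it, goes through `parse_xml`, which expects results and references in the NEWS XML format illustrated by `news_results.xml` and `news_test.xml` above.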