From 897eb1e4541a8264389db0f3ec219fe849a37d1e Mon Sep 17 00:00:00 2001 From: Shyam Upadhyay Date: Sat, 26 Jan 2019 16:39:58 -0500 Subject: [PATCH] code upload --- .gitignore | 16 + README.md | 71 +- baseline/Makefile | 10 + baseline/align.c | 717 ++++++++++++++++++ baseline/align.py | 96 +++ baseline/align_utils.py | 10 + baseline/baseline.py | 367 +++++++++ baseline/perceptron.c | 397 ++++++++++ baseline/perceptron_c.py | 114 +++ load_and_test_model_interactive.sh | 25 + load_and_test_model_on_files.sh | 32 + readers/aligned_reader.py | 121 +++ seq2seq/constants.py | 11 + seq2seq/encoder.py | 61 ++ seq2seq/evaluators/reporter.py | 185 +++++ seq2seq/inferences/evaluate.py | 27 + seq2seq/inferences/monotonic_infer.py | 172 +++++ seq2seq/lang.py | 29 + seq2seq/main.py | 151 ++++ seq2seq/model_utils.py | 108 +++ seq2seq/monotonic_decoder.py | 42 + seq2seq/runner.py | 52 ++ seq2seq/torch_utils.py | 33 + seq2seq/trainers/monotonic_train.py | 116 +++ seq2seq/trainers/seq2seq_attn_trainer.py | 195 +++++ train_model.sh | 28 + train_model_on_files.sh | 30 + utils/__init__.py | 0 utils/arguments.py | 44 ++ .../news_evaluation_script/news_evaluation.py | 510 +++++++++++++ utils/news_evaluation_script/news_results.xml | 108 +++ utils/news_evaluation_script/news_test.xml | 36 + 32 files changed, 3906 insertions(+), 8 deletions(-) create mode 100644 .gitignore create mode 100644 baseline/Makefile create mode 100644 baseline/align.c create mode 100644 baseline/align.py create mode 100644 baseline/align_utils.py create mode 100755 baseline/baseline.py create mode 100644 baseline/perceptron.c create mode 100644 baseline/perceptron_c.py create mode 100755 load_and_test_model_interactive.sh create mode 100755 load_and_test_model_on_files.sh create mode 100644 readers/aligned_reader.py create mode 100644 seq2seq/constants.py create mode 100644 seq2seq/encoder.py create mode 100644 seq2seq/evaluators/reporter.py create mode 100644 seq2seq/inferences/evaluate.py create mode 100644 seq2seq/inferences/monotonic_infer.py create mode 100644 seq2seq/lang.py create mode 100644 seq2seq/main.py create mode 100644 seq2seq/model_utils.py create mode 100644 seq2seq/monotonic_decoder.py create mode 100644 seq2seq/runner.py create mode 100644 seq2seq/torch_utils.py create mode 100644 seq2seq/trainers/monotonic_train.py create mode 100644 seq2seq/trainers/seq2seq_attn_trainer.py create mode 100755 train_model.sh create mode 100755 train_model_on_files.sh create mode 100644 utils/__init__.py create mode 100644 utils/arguments.py create mode 100755 utils/news_evaluation_script/news_evaluation.py create mode 100644 utils/news_evaluation_script/news_results.xml create mode 100644 utils/news_evaluation_script/news_test.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a65a715 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.tab +*.dict +*.pred +*.model +*.tar +*.vocab +*.vocab.romanized +*.tar.gz +phone_index* +data/ +.idea +*.txt +*.pyc +*.log +*.so +m2m/ \ No newline at end of file diff --git a/README.md b/README.md index b844e87..73de0d8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,67 @@ -Code for the EMNLP paper, "Bootstrapping Transliteration with Guided Discovery for Low-Resource Languages". -Coming soon. +## Running the code +1. First compile the C code for the aligner. 
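+
+This produces `libalign.so` and `libperceptron.so` in `baseline/`; the Python
+wrappers load them through `ctypes`. Once `make` (run as shown below) has
+finished, a quick load check (a minimal sketch, assuming it is executed from
+the repository root) is:
+
+```python
+from ctypes import cdll
+
+# Both calls should return a library handle without raising OSError.
+cdll.LoadLibrary('./baseline/libalign.so')
+cdll.LoadLibrary('./baseline/libperceptron.so')
+```
+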
+```bash +cd baseline/ +make ``` -@InProceedings{UKR18, - author = {Upadhyay, Shyam and Kodner, Jordan and Roth, Dan}, - title = {Bootstrapping Transliteration with Guided Discovery for Low-Resource Languages}, - booktitle = {EMNLP}, - year = {2018}, -} + +2. write you train, dev and test data in the following format, + +``` +x1 x2 x3y1 y2 y3 y4 y5 +``` +where `x1x2x3` is the input word (`xi` is the character), and `y1y2y3y4y5` is the desired output (transliteration). Example train and test files for bengali are in data/ folder. There is a optional 3rd column marking whether the word is *native* or *foreign* (see the paper for these terms); this column can be ignored for most purposes. + + +3. Run `train_model_on_files.sh` on your train (say train.txt) and dev file (dev.txt) as follows, + +``` +./train_model_on_files.sh train.txt dev.txt 100 translit.model +``` + +where 100 is the random seed and translit.model is the output model. Other parameters(see `utils/arguments.py` for options) can be specified by modifying the `train_model_on_files.sh` script appropriately. + +4. Test the trained model as follows, + +``` +./load_and_test_model_on_files.sh train.txt test.txt translit.model 100 output.txt +``` + +The output should report relevant metrics, + +``` +... +... +:INFO: --------------------TEST-------------------- +:INFO: running infer on example 200 +:INFO: running infer on example 400 +:INFO: running infer on example 600 +:INFO: running infer on example 800 +:INFO: accuracy 367/997=0.37 +:INFO: accuracy (nat) 308/661=0.47 +:INFO: accuracy (eng) 59/336=0.18 +:INFO: ********************total******************** +:INFO: ACC: 0.371457 (367/988) +:INFO: Mean F-score: 0.910995 +:INFO: Mean ED@1: 1.136640+-1.167 +:INFO: Mean NED@1: 0.084884 +:INFO: Median ED@1: 1.000000 +... +... ``` + +There is also a interactive mode where one can input test words directly, + +``` +./load_and_test_model_interactive.sh +... +... +:INFO: => loading checkpoint hindi.model +:INFO: => loaded checkpoint! +enter surface:ओबामा +ओ ब ा म ा +[(-0.4624647759074629, 'o b a m a')] +``` + diff --git a/baseline/Makefile b/baseline/Makefile new file mode 100644 index 0000000..50bbda8 --- /dev/null +++ b/baseline/Makefile @@ -0,0 +1,10 @@ +all: libperceptron.so libalign.so + +libperceptron.so: perceptron.c + gcc -O3 -Wall -Wextra -shared -fPIC perceptron.c -o libperceptron.so + +libalign.so: align.c + gcc -O3 -Wall -Wextra -shared -fPIC align.c -o libalign.so + +clean: + /bin/rm libperceptron.so libalign.so *.pyc diff --git a/baseline/align.c b/baseline/align.c new file mode 100644 index 0000000..632be85 --- /dev/null +++ b/baseline/align.c @@ -0,0 +1,717 @@ +/************************************************************************/ +/* crpalign - Chinese Restaurant Process string pair aligner */ +/* Copyright © 2013 Mans Hulden */ +/* */ +/* This file is part of crpalign. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/************************************************************************/ + +/* To build python bindings: gcc -O3 -Wall -Wextra -shared align.c -o libalign.so */ +/* WARNING: currently not thread-safe */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Yields minimum of three values */ +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? 
(b) : (c))) +/* Compares three values, yielding -1, 0, 1 depending on if a, b, or c is the smallest */ +#define CMP3(a, b, c) ((a) < (b) ? ((a) < (c) ? (-1) : (1)) : ((b) < (c) ? (0) : (1))) + +#define LEFT -1 +#define DIAG 0 +#define DOWN 1 + +#define INPUT_FORMAT_L2P 0 +#define INPUT_FORMAT_NEWS 1 + +#define OUTPUT_FORMAT_PLAIN 0 +#define OUTPUT_FORMAT_ALIGNED 1 +#define OUTPUT_FORMAT_PHONETISAURUS 2 +#define OUTPUT_FORMAT_M2M 3 + +#define MATRIX_MODE_MED 0 +#define MATRIX_MODE_GS 1 + +int g_maxsymbol = 0; +int g_debug = 0; +int g_med = 0; +int g_in_result[256]; +int g_out_result[256]; +int g_paircount = 0; +int g_distinct_pairs = 0; +int g_input_format = INPUT_FORMAT_L2P; +int g_output_format = OUTPUT_FORMAT_ALIGNED; +double g_trellis[256][256]; +int g_backptr[256][256]; +int g_current_count[256][256]; +int g_global_count[256][256]; +char *g_symboltable[1024]; +double g_prior = 0.1; +double g_zero = 0.0; + +struct stringpair { /* These are all */ + int *in; /* -1 terminated int sequences */ + int *out; + int *inaligned; + int *outaligned; + struct stringpair *next; +} *g_stringpairs = NULL, *g_stringpairs_tail = NULL; + +void align_init(void) { + g_stringpairs = NULL; + g_stringpairs_tail = NULL; +} + +void align_init_with_seed(long seed) { + g_stringpairs = NULL; + g_stringpairs_tail = NULL; + srand48(seed); +} + +int intseqlen(int *seq) { + int i; + for (i = 0; seq[i] != -1; i++) { } + return i; +} + +double log_add(double logy, double logx) { + /* Supposes that inputs are negative log probabilities */ + if (logy > logx) { + double temp = logx; + logx = logy; + logy = temp; + } + double negdiff = logx - logy; + if (negdiff > 80) { + return(logy); + } + return logx - log(1 + exp(logx - logy)); +} + +void debug(const char *fmt, ...) { + va_list arg; + if (g_debug == 1) { + va_start(arg, fmt); + vprintf(fmt, arg); + va_end(arg); + } +} + +/* Gives length in bytes of UTF-8 character */ +int utf8len(char *str) { + unsigned char s; + s = (unsigned char)(unsigned int) (*str); + if (s < 0x80) + return 1; + if ((s & 0xe0) == 0xc0) { + return 2; + } + if ((s & 0xf0) == 0xe0) { + return 3; + } + if ((s & 0xf8) == 0xf0) { + return 4; + } + return 0; +} + +/* Reverses an integer sequence in-place */ +void vector_reverse(int *s, int length) { + int c, i, j; + for (i = 0, j = length - 1; i < j; i++, j--) { + c = s[i]; + s[i] = s[j]; + s[j] = c; + } +} + +/* Returns number of UTF8 characters in char array */ +int utf8strlen(char *str) { + int i,j, len; + len = strlen(str); + for (i = 0, j = 0; *(str+i) != '\0' && i < len; j++) { + i = i + utf8len(str+i); + } + return j; +} + +int random_3draw(double a, double b, double c) { + + /* From three negative logprobs, do a weighted coin toss */ + /* proportional to each probability, returing -1, 0, 1 */ + /* depending on if a, b, or c is drawn. */ + + double minv, subv, rand; + /* Scale neg logprobs */ + minv = MIN3(a, b, c); + if (minv >= 2) { + subv = minv - 2; /* <= we scale so that highest prob entry is 2 (in -log space) */ + a -= subv; /* This to avoid underflow when converting to reals */ + b -= subv; /* for the weighted random choice. */ + c -= subv; + } + a = exp(-a); /* Convert to three probabilities */ + b = exp(-b); + c = exp(-c); + rand = drand48(); + rand = rand * (a + b + c); + if (rand < a) { return -1; } + if (rand < a+b) { return 0; } + return 1; +} + +/* Fills trellis with aligned integer sequences in and out, using the callback function */ +/* cost(). 
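Each cell of the trellis combines three moves scored by cost(): inserting an output symbol (cost(0, out)), deleting an input symbol (cost(in, 0)), or pairing the two (cost(in, out)).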
Returns aligned strings in g_in_result[] and g_out_result[] */ +/* If mode = MODE_GS, we resample alignments by a CRP process (filling trellis "forward" */ +/* and then drawing a new alignment going "backward") */ +/* If mode = MODE_MED, we find the "cheapest" alignment */ + +double fill_trellis(int *in, int *out, double(*cost)(int, int), int mode) { + int i, x, y, inlen, outlen; + double left, down, diag, p; + inlen = intseqlen(in); + outlen = intseqlen(out); + g_trellis[0][0] = g_zero; + for (x = 1; x <= outlen; x++) { + g_trellis[x][0] = g_trellis[x-1][0] + cost(0,out[x-1]); + g_backptr[x][0] = LEFT; + } + for (y = 1; y <= inlen; y++) { + g_trellis[0][y] = g_trellis[0][y-1] + cost(in[y-1], 0); + g_backptr[0][y] = DOWN; + } + for (x = 1; x <= outlen; x++) { + for (y = 1; y <= inlen; y++) { + left = g_trellis[x-1][y] + cost(0,out[x-1]); + down = g_trellis[x][y-1] + cost(in[y-1], 0); + diag = g_trellis[x-1][y-1] + cost(in[y-1], out[x-1]); + + if (mode == MATRIX_MODE_MED) { + g_trellis[x][y] = MIN3(left, diag, down); + g_backptr[x][y] = CMP3(left, diag, down); + } + else if (mode == MATRIX_MODE_GS) { + g_trellis[x][y] = log_add(log_add(left, diag), down); + } + } + } + + /* Resample a new "path" for the string pair starting from upper right-hand corner + in the matrix and moving left, down, or diagonally down/left until we reach [0,0] + ..[B][A] To choose the direction we do a weighted coin toss between choices A -> B, A -> C, A -> D: + ..[C][D] w(B) = p(B) * p(B->A) ; w(C) = p(C) * p(C->A) ; w(D) = p(D) * p(D -> A). + . . and p(X->Y) = the probability of taking the transition (X->Y) + . . Since we've stored the probabilities in log space, we need to do some scaling + and conversion before doing the weighted toss. + */ + + if (mode == MATRIX_MODE_GS) { + for (y = inlen, x = outlen; x > 0 || y > 0 ; ) { + if (x == 0) { + y--; + } else if (y == 0) { + x--; + } else { + left = g_trellis[x-1][y] + cost(0,out[x-1]); + down = g_trellis[x][y-1] + cost(in[y-1], 0); + diag = g_trellis[x-1][y-1] + cost(in[y-1], out[x-1]); + g_backptr[x][y] = random_3draw(left, diag, down); + x--; + y--; + } + } + } + + for (i = 0, y = inlen, x = outlen; x > 0 || y > 0; i++) { + if (g_backptr[x][y] == DIAG) { + x--; + y--; + g_in_result[i] = in[y]; + g_out_result[i] = out[x]; + } else if (g_backptr[x][y] == LEFT) { + x--; + g_in_result[i] = 0; + g_out_result[i] = out[x]; + } else if (g_backptr[x][y] == DOWN) { + y--; + g_in_result[i] = in[y]; + g_out_result[i] = 0; + } + } + + g_in_result[i] = -1; + g_out_result[i] = -1; + + vector_reverse(g_in_result, i); + vector_reverse(g_out_result, i); + p = g_trellis[outlen][inlen]; + return(p); +} + +/* Removes the counts of symbol pairs in two -1 -terminated sequences */ +/* to the current count table */ +void remove_counts(int *in, int *out) { + int i; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + g_current_count[in[i]][out[i]]--; + if (g_current_count[in[i]][out[i]] == 0) { + g_distinct_pairs--; + } + } +} + +/* Add the counts of symbol pairs in two -1 -terminated sequences */ +/* to the current count table */ +void add_counts(int *in, int *out) { + int i; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + g_current_count[in[i]][out[i]]++; + g_paircount++; + if (g_current_count[in[i]][out[i]] == 1) { + g_distinct_pairs++; + } + } +} + +/* Add running counts of pairs to the global count table */ +void add_global_counts() { + int i, j; + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + g_global_count[i][j] += g_current_count[i][j]; + } + } 
+} + +void print_counts() { + int i, j; + debug("\n"); + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + debug("%i ", g_current_count[i][j]); + } + debug("\n"); + } +} + +/* Cost function called by fill_trellis for MED */ +double cost_levenshtein(int a, int b) { + if (a != b) { + return 1.0; + } + return 0.0; +} + +/* Cost function called by fill_trellis for CRP alignment */ +double cost_crp(int in, int out) { + double cost; + cost = (double)( g_current_count[in][out] + g_prior ) / (double)( g_paircount + g_distinct_pairs * g_prior ); + return(-log(cost)); +} + +/* Initially, align all string pairs greedily, i.e. e.g. => */ +void initial_align() { + struct stringpair *pair; + int inlen, outlen, i, j, k; + for (pair = g_stringpairs; pair != NULL; pair = pair->next) { + inlen = intseqlen(pair->in); + outlen = intseqlen(pair->out); + pair->inaligned = malloc(sizeof(int) * (inlen+outlen+1)); + pair->outaligned = malloc(sizeof(int) * (inlen+outlen+1)); + + for (i = 0, j = 0, k = 0; pair->in[i] != -1 || pair->out[j] != -1; k++) { + if (pair->in[i] == -1) { + pair->inaligned[k] = 0; + pair->outaligned[k] = pair->out[j]; + j++; + } + else if (pair->out[j] == -1) { + pair->inaligned[k] = pair->in[i]; + pair->outaligned[k] = 0; + i++; + } else { + pair->inaligned[k] = pair->in[i]; + pair->outaligned[k] = pair->out[j]; + i++; + j++; + } + } + pair->inaligned[k] = -1; + pair->outaligned[k] = -1; + add_counts(pair->inaligned, pair->outaligned); + } +} + +/* Align a set of string pairs by minimum edit distance (for reference) */ +void med_align() { + struct stringpair *sp; + int j; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + fill_trellis(sp->in, sp->out, &cost_levenshtein, MATRIX_MODE_MED); /* Fill trellis */ + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + } +} + +void crp_align() { + struct stringpair *sp; + int j; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + fill_trellis(sp->in, sp->out, &cost_crp, MATRIX_MODE_MED); + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + } +} + +void crp_train(int iterations, int burnin, int lag) { + struct stringpair *sp; + int i, j; + for (i = 0; i < iterations; i++) { + fprintf(stderr,"Alignment iteration: %i\n", i); + print_counts(); + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + remove_counts(sp->inaligned, sp->outaligned); /* Remove counts before aligning */ + fill_trellis(sp->in, sp->out, &cost_crp, MATRIX_MODE_GS); + for (j = 0; g_in_result[j] != -1; j++) { + sp->inaligned[j] = g_in_result[j]; + sp->outaligned[j] = g_out_result[j]; + } + sp->inaligned[j] = -1; + sp->outaligned[j] = -1; + add_counts(sp->inaligned, sp->outaligned); /* Add counts back from new alignment */ + } + if (i > burnin && i % lag == 0) { + add_global_counts(); + } + } +} + +int get_set_char_num(char *utfstring) { + int i; + debug("Finding symbol %s with len %i... 
", utfstring, utf8len(utfstring)); + for (i = 1; i <= g_maxsymbol; i++) { + if (strcmp(utfstring, g_symboltable[i]) == 0) { + debug("Found at %i\n", i); + return i; + } + } + g_maxsymbol++; + debug("Not found, adding at %i\n", g_maxsymbol); + g_symboltable[g_maxsymbol] = strdup(utfstring); + return(g_maxsymbol); +} + +/* Reads character sequences in and out and onverts them to integer sequences */ +/* And adds them to the global list of integer sequence pairs */ + +void add_string_pair(char *in, char *out) { + int *int_in, *int_out; + int i, j; + char *token; + struct stringpair *newpair; + /* Get int array */ + int_in = malloc(sizeof(int) * (utf8strlen(in) + 1)); + int_out = malloc(sizeof(int) * (utf8strlen(out) + 1)); + if (g_input_format == INPUT_FORMAT_L2P) { + for (i = 0, j = 0; in[i] != '\0'; i += utf8len(&in[i]), j++) { + int_in[j] = get_set_char_num(strndup(&in[i], utf8len(&in[i]))); + } + int_in[j] = -1; + for (i = 0, j = 0; out[i] != '\0'; i += utf8len(&out[i]), j++) { + int_out[j] = get_set_char_num(strndup(&out[i], utf8len(&out[i]))); + } + int_out[j] = -1; + } else if (g_input_format == INPUT_FORMAT_NEWS) { + token = strtok(in, " "); + for (j = 0; token != NULL; j++) { + int_in[j] = get_set_char_num(token); + token = strtok(NULL, " "); + } + int_in[j] = -1; + token = strtok(out, " "); + for (j = 0; token != NULL; j++) { + int_out[j] = get_set_char_num(token); + token = strtok(NULL, " "); + } + int_out[j] = -1; + } + + newpair = malloc(sizeof(struct stringpair)); + newpair->in = int_in; + newpair->out = int_out; + newpair->next = NULL; + if (g_stringpairs == NULL) { + g_stringpairs = newpair; + g_stringpairs_tail = newpair; + } else { + g_stringpairs_tail->next = newpair; + g_stringpairs_tail = newpair; + } +} + +/* Directly add two -1 terminated integer sequences */ +void add_int_pair(int *in, int *out) { + int inlen, outlen; + struct stringpair *newpair; + newpair = malloc(sizeof(struct stringpair)); + inlen = intseqlen(in) + 1; + outlen = intseqlen(out) + 1; + newpair->in = malloc(inlen * sizeof(int)); + newpair->out = malloc(outlen * sizeof(int)); + memcpy(newpair->in, in, inlen * sizeof(int)); + memcpy(newpair->out, out, outlen * sizeof(int)); + newpair->next = NULL; + if (g_stringpairs == NULL) { + g_stringpairs = newpair; + g_stringpairs_tail = newpair; + } else { + g_stringpairs_tail->next = newpair; + g_stringpairs_tail = newpair; + } +} + +void clear_counts() { + int i,j; + for (i = 0; i <= g_maxsymbol; i++) { + for (j = 0; j <= g_maxsymbol; j++) { + g_current_count[i][j] = 0; + g_global_count[i][j] = 0; + } + } +} + +void print_pair_plain(int *in, int *out) { + int i; + g_symboltable[0] = " "; + for (i = 0; in[i] != -1; i++) { + printf("%s", in[i] == 0 ? " " : g_symboltable[ in[i] ]); + } + printf("\n"); + for (i = 0; out[i] != -1; i++) { + printf("%s", out[i] == 0 ? " " : g_symboltable[ out[i] ]); + } + printf("\n\n"); +} + +void print_pair_m2m(int *in, int *out) { + int i; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1; i++) { + printf("%s|", in[i] == 0 ? " " : g_symboltable[ in[i] ]); + } + printf("\t"); + for (i = 0; out[i] != -1; i++) { + printf("%s|", out[i] == 0 ? 
" " : g_symboltable[ out[i] ]); + } + printf("\n"); +} + +void print_pair_phonetisaurus(int *in, int *out) { + int i; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + printf("%s}%s", g_symboltable[in[i]], g_symboltable[out[i]]); + if (in[i+1] != -1 && out[i+1] != -1) { + printf(" "); + } + } + printf("\n"); +} + +void print_pair_aligned(int *in, int *out) { + int i, fieldwidth; + char *instr, *outstr; + g_symboltable[0] = "_"; + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + instr = g_symboltable[ in[i] ]; + outstr = g_symboltable[ out[i] ]; + fieldwidth = utf8strlen(instr) > utf8strlen(outstr) ? utf8strlen(instr) : utf8strlen(outstr); + printf("%-*s", fieldwidth, instr); + if (in[i+1] != -1 && out[i+1] != -1) + printf("|"); + } + printf("\n"); + for (i = 0; in[i] != -1 && out[i] != -1; i++) { + instr = g_symboltable[ in[i] ]; + outstr = g_symboltable[ out[i] ]; + fieldwidth = utf8strlen(instr) > utf8strlen(outstr) ? utf8strlen(instr) : utf8strlen(outstr); + printf("%-*s", fieldwidth, outstr); + if (in[i+1] != -1 && out[i+1] != -1) + printf("|"); + } + printf("\n\n"); +} + +/* Functions for Python ctypes wrap */ + +struct stringpair *getpairs_init() { + return g_stringpairs; +} + +int *getpairs_in(struct stringpair *sp) { + return sp->inaligned; +} + +int *getpairs_out(struct stringpair *sp) { + return sp->outaligned; +} + +struct stringpair *getpairs_advance(struct stringpair *sp) { + return sp->next; +} + +/************************************/ + +void write_stringpairs() { + struct stringpair *sp; + for (sp = g_stringpairs; sp != NULL; sp = sp->next) { + switch(g_output_format) { + case OUTPUT_FORMAT_PLAIN: + print_pair_plain(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_ALIGNED: + print_pair_aligned(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_PHONETISAURUS: + print_pair_phonetisaurus(sp->inaligned, sp->outaligned); + break; + case OUTPUT_FORMAT_M2M: + print_pair_m2m(sp->inaligned, sp->outaligned); + break; + } + } +} + +void read_stringpairs() { + char *my_string = NULL, *token1, *token2; + char str1[1024], str2[1024]; + size_t nbytes; + int bytes_read; + while ((bytes_read = getline(&my_string, &nbytes, stdin)) != -1) { + if (g_input_format == INPUT_FORMAT_L2P) { + if (sscanf(my_string, "%1023s %1023s", &str1[0], &str2[0]) == 2) + add_string_pair(str1, str2); + } else if (g_input_format == INPUT_FORMAT_NEWS) { + token1 = strtok(my_string, "\t\n"); + token2 = strtok(NULL, "\t\n"); + if (token1 != NULL && token2 != NULL) + add_string_pair(token1, token2); + } + } + clear_counts(); + initial_align(); +} + +int main(int argc, char **argv) { + static char *usagestring = + "Chinese restaurant process string pair aligner\n" + "Basic usage: crpalign11 [options] < infile.txt > aligned.txt\n" + " infile.txt is a list of TAB-separated word-pairs, one pair per line.\n\n" + "Options:\n" + "-d --debug print debug info\n" + "-h --help help\n" + "-m --med do simple med-alignment only (for comparison)\n" + "-x NUM --iterations=NUM run aligner for NUM iterations (default 10)\n" + "-i FMT --informat=FMT expect data in format FMT=l2p|news (default l2p)\n" + "-o FMT --outformat=FMT print data in format FMT=plain|aligned|phonetisaurus|m2m\n" + "-b NUM --burnin=NUM run Gibbs sampler with NUM iterations of burn-in\n" + "-l NUM --lag=NUM collect counts from sampler every NUM iterations\n" + "-p NUM --prior=NUM use a prior of NUM for sampler (default 0.1)\n"; + + + int opt, iterations = 10, burnin = 5, lag = 1, option_index = 0; + static struct 
option long_options[] = + { + {"debug", no_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"med", no_argument, 0, 'm'}, + {"iterations", required_argument, 0, 'x'}, + {"informat", required_argument, 0, 'i'}, + {"outformat", required_argument, 0, 'o'}, + {"burnin", required_argument, 0, 'b'}, + {"lag", required_argument, 0, 'l'}, + {"prior", required_argument, 0, 'p'}, + {0, 0, 0, 0} + }; + + while ((opt = getopt_long(argc, argv, "dmx:b:l:p:i:o:h", long_options, &option_index)) != -1) { + switch(opt) { + case 'd': + g_debug = 1; + break; + case 'm': + g_med = 1; + break; + case 'x': + iterations = atoi(optarg); + break; + case 'b': + burnin = atoi(optarg); + break; + case 'h': + printf("%s", usagestring); + exit(0); + case 'i': + if (strcmp(optarg,"l2p") == 0) { + g_input_format = INPUT_FORMAT_L2P; + } else if (strcmp(optarg, "news") == 0) { + g_input_format = INPUT_FORMAT_NEWS; + } else { + fprintf(stderr, "Invalid option %s for input format\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'o': + if (strcmp(optarg,"plain") == 0) { + g_output_format = OUTPUT_FORMAT_PLAIN; + } else if (strcmp(optarg, "aligned") == 0) { + g_output_format = OUTPUT_FORMAT_ALIGNED; + } else if (strcmp(optarg, "phonetisaurus") == 0) { + g_output_format = OUTPUT_FORMAT_PHONETISAURUS; + } else if (strcmp(optarg, "m2m") == 0) { + g_output_format = OUTPUT_FORMAT_M2M; + } else { + fprintf(stderr, "Invalid option %s for output format\n", optarg); + exit(EXIT_FAILURE); + } + break; + case 'l': + lag = atoi(optarg); + break; + case 'p': + g_prior = strtod(optarg,NULL); + break; + } + } + + srand48((unsigned int)time((time_t *)NULL)); + read_stringpairs(); + if (g_med == 1) { + med_align(); + } else { + crp_train(iterations,burnin,lag); + crp_align(); + } + write_stringpairs(); + return(0); +} diff --git a/baseline/align.py b/baseline/align.py new file mode 100644 index 0000000..0ce267c --- /dev/null +++ b/baseline/align.py @@ -0,0 +1,96 @@ +# Simple class for learning an alignment of strings, MED-style. +# Weights are learned by a Chinese Restaurant Process sampler +# that weights single alignments x:y in proportion to how many times +# such an alignment has been seen elsewhere out of all possible alignments. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 + +# Usage: +# Align(wordpairs) <= wordpairs is an iterable of 2-tuples +# The resulting Align.alignedpairs is a list of aligned 2-tuples + +# Relies on C-code in libalign.so built from align.c through ctypes. 
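+#
+# Illustrative usage (a minimal sketch; assumes libalign.so has been built with
+# `make` in baseline/ and that the interpreter is started from the repository
+# root, since the library is loaded from './baseline/libalign.so'):
+#
+#   from baseline.align import Aligner
+#   # Toy word pairs; any iterable of (input, output) string 2-tuples works.
+#   pairs = [(u'kitab', u'kitaab'), (u'obama', u'obaamaa')]
+#   a = Aligner(pairs, align_symbol=u' ', iterations=10, burnin=5, lag=1,
+#               mode='crp', random_seed=42)
+#   for src, tgt in a.alignedpairs:
+#       # Each aligned pair has equal length, padded with align_symbol.
+#       print(src + u' | ' + tgt)
+#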
+# Author: Mans Hulden +# MH20151102 + +import itertools +from ctypes import * + +libalign = cdll.LoadLibrary('./baseline/libalign.so') + +libalign_add_int_pair = libalign.add_int_pair +libalign_clear_counts = libalign.clear_counts +libalign_initial_align = libalign.initial_align +libalign_crp_train = libalign.crp_train +libalign_crp_align = libalign.crp_align +libalign_med_align = libalign.med_align + +libalign_getpairs_init = libalign.getpairs_init +libalign_getpairs_init.restype = c_void_p +libalign_getpairs_in = libalign.getpairs_in +libalign_getpairs_in.restype = POINTER(c_int) +libalign_getpairs_out = libalign.getpairs_out +libalign_getpairs_out.restype = POINTER(c_int) +libalign_getpairs_advance = libalign.getpairs_advance +libalign_getpairs_advance.restype = c_void_p +libalign_align_init = libalign.align_init +libalign_align_init.restype = None +libalign_align_init_with_seed = libalign.align_init_with_seed +libalign_align_init.restype = None + +class Aligner: + + def __init__(self, wordpairs, align_symbol = u' ', iterations = 10, burnin = 5, lag = 1, mode = 'crp', random_seed = None): + s = set(u''.join((x[0] + x[1] for x in wordpairs))) + self.symboltoint = dict(zip(s, range(1,len(s)+1))) + self.inttosymbol = {v:k for k, v in self.symboltoint.items()} + self.inttosymbol[0] = align_symbol + ## Map stringpairs to -1 terminated integer sequences ## + intpairs = [] + for i, o in wordpairs: + intin = list(map(lambda x: self.symboltoint[x], i)) + [-1] + intout = list(map(lambda x: self.symboltoint[x], o)) + [-1] + intpairs.append((intin, intout)) + + if random_seed: + libalign_align_init_with_seed(random_seed) + else: + libalign_align_init() + + for i, o in intpairs: + icint = (c_int * len(i))(*i) + ocint = (c_int * len(o))(*o) + libalign_add_int_pair(icint, ocint) + + # Run CRP align + if mode == 'crp': + libalign_clear_counts() + libalign_initial_align() + libalign_crp_train(c_int(iterations), c_int(burnin), c_int(lag)) + libalign_crp_align() + else: + libalign_clear_counts() + libalign_initial_align() + libalign_med_align() + + # Reconvert to output + self.alignedpairs = [] + stringpairptr = libalign_getpairs_init() + while stringpairptr != None: + inints = libalign_getpairs_in(c_void_p(stringpairptr)) + outints = libalign_getpairs_out(c_void_p(stringpairptr)) + instr = [] + outstr = [] + for j in itertools.count(): + if inints[j] == -1: + break + instr.append(self.inttosymbol[inints[j]]) + for j in itertools.count(): + if outints[j] == -1: + break + outstr.append(self.inttosymbol[outints[j]]) + self.alignedpairs.append((''.join(instr), ''.join(outstr))) + stringpairptr = libalign_getpairs_advance(c_void_p(stringpairptr)) diff --git a/baseline/align_utils.py b/baseline/align_utils.py new file mode 100644 index 0000000..5231509 --- /dev/null +++ b/baseline/align_utils.py @@ -0,0 +1,10 @@ +import baseline.align as align + +def mcmc_align(wordpairs, align_symbol,seed): + a = align.Aligner(wordpairs, align_symbol=align_symbol,random_seed=seed) + return a.alignedpairs + + +def med_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol=align_symbol, mode='med') + return a.alignedpairs diff --git a/baseline/baseline.py b/baseline/baseline.py new file mode 100755 index 0000000..ae4c5c3 --- /dev/null +++ b/baseline/baseline.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python +""" +Baseline system for the SIGMORPHON 2016 Shared Task. + +Solves tasks 1,2, and 3, evaluating on dev data and outputs guesses. 
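+
+Example invocation (a minimal sketch; option names follow main() below, and
+the data files under --path are expected to be named LANG-task1-train,
+LANG-task1-dev, and so on):
+
+    python baseline.py --language=LANG --task=1 --align=mcmc --path=./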
+ +Author: Mans Hulden +Last Update: 11/29/2015 +""" + +from __future__ import print_function +import perceptron_c, align, codecs, sys, re, getopt + +class MorphModel: + def __init__(self): + self.features = {'tolemma':None, 'fromlemma':None} + self.classes = {'tolemma':None, 'fromlemma':None} + self.classifier = {'tolemma':None, 'fromlemma':None} + +class Morph: + + def __init__(self): + self.models = {} + self.msdfeatures = None + self.msdclasses = None + self.msdclassifier = None + + def generate(self, word, featurestring, mode): + """Generates an output string from an input word and target + feature string. The 'mode' variable is either 'tolemma' or + 'fromlemma' """ + pos = re.match(r'pos=([^,]*)', featurestring).group(1) + ins = ['<'] + list(word) + ['>'] + outs = [] + prevaction = 'None' + position = 0 + while position < len(ins): + feats = list(train_get_surrounding_syms(ins, position, u'in_')) + \ + list(train_get_surrounding_syms(outs, position, u'out_', lookright = False)) + \ + ['prevaction='+prevaction] + [u'MSD:' + featurestring] + feats = feature_pairs(feats) + decision = self.models[pos].classifier[mode].decision_function(feats) + decision = sorted(decision, key = lambda x: x[1], reverse = True) + prevaction = self._findmax(decision, prevaction, len(ins)-position-1) + actionlength, outstring = interpret_action(prevaction, ins[position]) + outs.append(outstring) + position += actionlength + return ''.join(outs[1:-1]) + + def _findmax(self, decision, lastaction, maxlength): + """Find best action that doesn't conflict with last (can't del/ins/chg two in a row) + and isn't too long (can't change/del more than what remains).""" + if lastaction[0] == 'D' or lastaction[0] == 'C' or lastaction[0] == 'I': + for x in xrange(len(decision)): + if decision[x][0][0] != lastaction[0]: + if decision[x][0][0] == u'C' and len(decision[x][0][1:]) > maxlength: + continue + if decision[x][0][0] == u'D' and int(decision[x][0][1:]) > maxlength: + continue + return decision[x][0] + else: + return decision[0][0] + + def add_features(self, pos, features, classes, mode): + """Adds a collection of feautures and classes to a pos model + 'mode' is either 'tolemma' or 'fromlemma'.""" + if pos not in self.models: + self.models[pos] = MorphModel() + self.models[pos].features[mode] = features + self.models[pos].classes[mode] = classes + + def get_pos(self): + """Simply lists all poses associated with a model.""" + return list(self.models.keys()) + + def add_classifier(self, pos, classifier, mode): + """Adds a classifier to a pos model in a certain mode.""" + self.models[pos].classifier[mode] = classifier + + def get_features(self, pos, mode): + return self.models[pos].features[mode] + + def get_classes(self, pos, mode): + return self.models[pos].classes[mode] + + def extract_task3(self, lang, path): + + # We use the msd/form combinations from all three + msdform = set() + lines = [line.strip() for line in codecs.open(path + lang +'-task1-train', "r", encoding="utf-8")] + for l in lines: + lemma, msd, form = l.split(u'\t') + msdform.add((msd, form)) + lines = [line.strip() for line in codecs.open(path + lang +'-task2-train', "r", encoding="utf-8")] + for l in lines: + msd1, form1, msd2, form2 = l.split(u'\t') + msdform.add((msd1, form1)) + msdform.add((msd2, form2)) + lines = [line.strip() for line in codecs.open(path + lang +'-task3-train', "r", encoding="utf-8")] + for l in lines: + form1, msd2, form2 = l.split(u'\t') + msdform.add((msd2, form2)) + + self.msdfeatures = [] + self.msdclasses = [] + for msd, 
form in msdform: + formfeatures = extract_substrings(form) + self.msdfeatures.append(formfeatures) + self.msdclasses.append(msd) + + def extract_task1(self, filename, mode, path): + """Parse a file and extract features/classes for + mapping to and from a lemma form.""" + + lemmas = {} + poses = set() + lines = [line.strip() for line in codecs.open(path + filename, "r", encoding="utf-8")] + for l in lines: + if 'pos=' not in l: + continue + lemma, feats, form = l.split(u'\t') + pos = re.match(r'pos=([^,]*)', feats).group(1) + if lemma not in lemmas: + lemmas[lemma] = [] + lemmas[lemma].append((lemma, 'pos=' + pos + ',lemma=true')) + lemmas[lemma].append((form, feats)) + if pos not in poses: + poses.add(pos) + + pairs = [] + wordpairs = [] + for lemma in lemmas: + lemmafeatures = lemmas[lemma] + for x in lemmafeatures: + for y in lemmafeatures: + if (x != y) and ('lemma=true' in x[1]) and (mode == 'fromlemma'): + pairs.append(tuple((x[0], y[0], y[1]))) + # inword, outword, msdfeatures + wordpairs.append(tuple((x[0], y[0]))) + elif (x != y) and ('lemma=true' in x[1]) and (mode == 'tolemma'): + pairs.append(tuple((y[0], x[0], y[1]))) + # inword, outword, msdfeatures + wordpairs.append(tuple((y[0], x[0]))) + + if ALIGNTYPE == 'mcmc': + alignedpairs = mcmc_align(wordpairs, ALIGN_SYM) + elif ALIGNTYPE == 'med': + alignedpairs = med_align(wordpairs, ALIGN_SYM) + else: + alignedpairs = dumb_align(wordpairs, ALIGN_SYM) + + chunkedpairs = chunk(alignedpairs) + + for pos in poses: # Do one model per POS + features = [] + classes = [] + for idx, pair in enumerate(chunkedpairs): + if 'pos=' + pos not in pairs[idx][2]: + continue + instring = ['<'] + [x[0] for x in pair] + ['>'] + outstring = ['<'] + [x[1] for x in pair] + ['>'] + + #msdfeatures = pairs[idx][2].split(':') # separate features + msdfeatures = [ pairs[idx][2] ] # don't separate features + msdfeatures = ['MSD:' + f for f in msdfeatures] + prevaction = 'None' + for position in range(0, len(instring)): + thiscl, feats = train_get_features(instring, outstring, position) + classes.append(thiscl) + featurelist = list(feats) + msdfeatures + ['prevaction='+prevaction] + featurelist = feature_pairs(featurelist) + features.append(featurelist) + prevaction = thiscl + self.add_features(pos, features, classes, mode) + +def feature_pairs(f): + """Expand features to include pairs of features + where one is always a f=v feature.""" + pairs = [x + ".x." + y for x in f for y in f if u'=' in y] + return pairs + f + +def dumb_align(wordpairs, align_symbol): + alignedpairs = [] + for idx, pair in enumerate(wordpairs): + ins = pair[0] + outs = pair[1] + if len(ins) > len(outs): + outs = outs + align_symbol * (len(ins)-len(outs)) + elif len(outs) > len(ins): + ins = ins + align_symbol * (len(outs)-len(ins)) + alignedpairs.append((ins, outs)) + return alignedpairs + +def mcmc_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol = align_symbol, random_seed = 42) + return a.alignedpairs + +def med_align(wordpairs, align_symbol): + a = align.Aligner(wordpairs, align_symbol = align_symbol, mode = 'med') + return a.alignedpairs + +def train_get_surrounding_syms(s, position, featureprefix, lookright = True): + """Get surrounding symbols from a list of chunks and position. 
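+
+    Left-context features are 'p' + featureprefix + (the last 1-3 characters
+    before position) and right-context features are 'n' + featureprefix + (the
+    first 1-3 characters from position on); the u'_' empty symbol is stripped
+    and 'none' marks a word boundary.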
+ >>> s = ['<', u'a', u'b', u'u', u'_', u't', u'a', u'n', u'doka', '>'] + >>> train_get_surrounding_syms(s, 4, 'in_') + set([u'nin_ta', u'nin_t', u'nin_tan', u'pin_u', u'pin_bu', u'pin_abu']) + """ + leftfeats = set() + rightfeats = set() + if position == 0: + leftfeats |= {u'p' + featureprefix + u'none'} + if (position == len(s)) and lookright: + rightfeats |= {u'n' + featureprefix + u'none'} + if position > 0: + left = ''.join(s[:position]).replace(u'_', u'') + leftfeats |= {u'p' + featureprefix + left[x:] for x in [-1,-2,-3]} + if (position < len(s)) and lookright: + right = ''.join(s[position:]).replace(u'_', u'') + rightfeats |= {u'n' + featureprefix + right[:x] for x in [1,2,3]} + return leftfeats | rightfeats + +def train_get_features(ins, outs, position): + feats = set() + # Get class first # + if ins[position] == outs[position]: + cl = "R" + elif u'_' in ins[position]: + cl = "I" + outs[position] + elif u'_' in outs[position]: + cl = "D" + unicode(len(ins[position])) + else: + cl = "C" + outs[position] + + # Get features of surrounding symbols # + feats |= train_get_surrounding_syms(ins, position, u'in_') + feats |= train_get_surrounding_syms(outs, position, u'out_', lookright = False) + return cl, feats + +def interpret_action(action, ins): + """Interpret classifier class: return length of input to consume + output.""" + if action[0] == u'R': + return (1, ins) + elif action[0] == u'D': + return int(action[1:]), u'' + elif action[0] == u'C': + return len(action[1:]), action[1:] + elif action[0] == u'I': + return 0, action[1:] + +def chopup(s, t): + """Returns grouped alignment of two strings + in such a way that consecutive del/ins/chg operations + are grouped to be one single operation. + The input is two 1-to-1 aligned strings where _ = empty string. 
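+    action() labels each position ins/del/chg/rep; a chunk boundary is placed
+    between two 'rep' positions and wherever the action type changes, so runs
+    of identical ins/del/chg positions are merged into a single operation.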
+ >>> chopup(['ka__yyab','kaxx__xy']) + (['k', 'a', u'_', 'yy', 'ab'], ['k', 'a', 'xx', u'_', 'xy']) + """ + def action(inchar, outchar): + if inchar == u'_': + return 'ins' + elif outchar == u'_': + return 'del' + elif inchar != outchar: + return 'chg' + else: + return 'rep' + + idx = 1 + s = list(s) + t = list(t) + while idx < len(s): + l = action(s[idx-1], t[idx-1]) + r = action(s[idx], t[idx]) + if (l == 'rep' and r == 'rep') or (l != r): + s.insert(idx, ' ') + t.insert(idx, ' ') + idx += 1 + idx += 1 + s = tuple(u'_' if u'_' in x else x for x in ''.join(s).split(' ')) + t = tuple(u'_' if u'_' in x else x for x in ''.join(t).split(' ')) + return zip(s,t) + +def chunk(pairs): + """Chunk alignments to have possibly more than one symbol-one symbol.""" + chunkedpairs = [] + for instr, outstr in pairs: + chunkedpairs.append(chopup(instr, outstr)) + return chunkedpairs + +def extract_substrings(word): + """Get len 2/3 substrings and return as list.""" + w3 = zip(word, word[1:], word[2:]) + w2 = zip(word, word[1:]) + return [''.join(x) for x in w2+w3] + +def announce(*objs): + print("***", *objs, file = sys.stderr) + +def main(argv): + global ALIGN_SYM + global ALIGNTYPE + global TASK + + options, remainder = getopt.gnu_getopt(argv[1:], 'l:t:a:p:', ['language=','task=','align=','path=']) + + PATH, ALIGN_SYM, ALIGNTYPE, TASK = './', u'_', 'mcmc', 1 + for opt, arg in options: + if opt in ('-l', '--language'): + LANGUAGE = arg + elif opt in ('-t', '--task'): + TASK = int(arg) + elif opt in ('-a', '--align'): + ALIGNTYPE = arg + elif opt in ('-p', '--path'): + PATH = arg + + train = Morph() + announce(LANGUAGE + ": learning alignment for form > lemma mapping") + train.extract_task1(LANGUAGE + '-task1-train', 'fromlemma', PATH) + if TASK == 2 or TASK == 3: + announce(LANGUAGE + ": learning alignment for lemma > form mapping") + train.extract_task1(LANGUAGE + '-task1-train', 'tolemma', PATH) + + if TASK == 1 or TASK == 2 or TASK == 3: + for pos in train.get_pos(): + announce(LANGUAGE + ": training " + pos + " for lemma > form mapping") + P = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + P.fit(train.get_features(pos, 'fromlemma'), train.get_classes(pos, 'fromlemma')) + train.add_classifier(pos, P, 'fromlemma') + + if TASK == 2 or TASK == 3: + for pos in train.get_pos(): + announce(LANGUAGE + ": training " + pos + " for form > lemma mapping") + P = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + P.fit(train.get_features(pos, 'tolemma'), train.get_classes(pos, 'tolemma')) + train.add_classifier(pos, P, 'tolemma') + + if TASK == 3: + train.extract_task3(LANGUAGE, PATH) + announce(LANGUAGE + ": training form > msd classifier") + train.msdclassifier = perceptron_c.Perceptron(shuffle = True, averaged = True, verbose = True, max_iter = 10, random_seed = 42) + train.msdclassifier.fit(train.msdfeatures, train.msdclasses) + + testlines = [line.strip() for line in codecs.open(PATH+LANGUAGE + '-task' + str(TASK) + '-dev', "r", encoding="utf-8")] + if TASK == 1: + for l in testlines: + lemma, targetmsd, wordform = l.split('\t') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((lemma + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + + if TASK == 2: + for l in testlines: + sourcemsd, sourceform, targetmsd, targetform = l.split('\t') + lemma = train.generate(sourceform, sourcemsd, 'tolemma') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((sourcemsd + "\t" + 
sourceform + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + + if TASK == 3: + for l in testlines: + sourceform, targetmsd, targetform = l.split('\t') + sourcemsd = train.msdclassifier.predict(extract_substrings(sourceform)) + lemma = train.generate(sourceform, sourcemsd, 'tolemma') + guess = train.generate(lemma, targetmsd, 'fromlemma') + print((sourceform + "\t" + targetmsd + "\t" + guess).encode("utf-8")) + +if __name__ == "__main__": + main(sys.argv) diff --git a/baseline/perceptron.c b/baseline/perceptron.c new file mode 100644 index 0000000..f3a6420 --- /dev/null +++ b/baseline/perceptron.c @@ -0,0 +1,397 @@ +/************************************************************************/ +/* Simple perceptron/averaged perceptron library */ +/* Author: Mans Hulden (mans.hulden@gmail.com) */ +/* Copyright 2014 Mans Hulden */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* - MH20140831 */ +/************************************************************************/ + +/* To build for python bindings: gcc -O3 -Wall -Wextra -shared perceptron.c -o libperceptron.so */ + +/* Usage: + +(1) Call perceptron_init with desired parameters +(2) Add examples to training set using examples_add() + - optionally also to dev set using devexamples_add() +(3) Train perceptron using perceptron_train() +(4) Classify with perceptron_decision_function_int()/perceptron_decision_function_double() + (int is for non-averaged/double for averaged) + which return a vector of weights for all classes + or: + Just use perceptron_classify_double()/perceptron_classify_int() + which return the best class index +(5) perceptron_destroy() frees all data structures associated + +Notes: + +- Only supports binary features in examples +- Uses a sparse representation where only hot features are given for examples +- Weights are integers, although double weights are used for the averaged case + +*/ + +/******************************************************************************/ + +#include +#include +#include +#include /* For INT_MAX */ +#include /* For DBL_MAX */ + +struct perceptron { + struct examples *ex; /* Training examples */ + struct examples *devex; /* Dev examples */ + int averaged; /* Use averaged perceptron or vanilla? */ + int tune_on_averaged; /* Whether to tune AP on averaged weights or running weights */ + int num_examples; /* Training set size */ + int num_devexamples; /* Dev set size */ + int examplecounter; /* Running counter when adding examples one-by-one */ + int devexamplecounter; /* Running counter when adding examples one-by-one */ + int num_classes; /* Number of distinct classes */ + int num_features; /* Numer of features */ + int max_iter; /* Max iterations to run */ + int shuffle; /* Shuffle examples before each iteration? */ + int verbose; /* Print stats to stderr? 
*/ + + int *intweights; /* Running int weights */ + int *intbiases; /* Running int biases */ + double *doubleweights; /* Weights for averaged perceptron */ + double *doublebiases; /* Biases */ + double *lastdweights; /* Store temp weights for tuning w/ dev set */ + double *lastdbiases; /* Store temp biases for tuning w/ dev set */ + int *lastiweights; /* Store temp weights for tuning w/ dev set */ + int *lastibiases; /* Store temp biases for tuning w/ dev set */ +}; + +struct examples { + int *hotfeatures; /* A list of the features that are hot in this example */ + int len; /* Number of hot features in example */ + int correctclass; /* The class the example belongs to */ +}; + +/******************************************************************************/ + +/* Initialize the perceptron structure, returns handle */ +struct perceptron *perceptron_init(int max_iter, int num_examples, int num_devexamples, int num_features, int num_classes, int averaged, int shuffle, int random_seed, int tune_on_averaged, int verbose); + +/* Train perceptron w/ current training/(dev) examples and settings */ +void perceptron_train(struct perceptron *perceptron); + +/* Free examples/weights + perceptron data structure */ +void perceptron_destroy(struct perceptron *p); + +/* Decision function for example (features holds list of hot features, len is number of hot features) */ +/* Returns vector of weights for each class (highest weight is best class) */ +double *perceptron_decision_function_double(struct perceptron *perceptron, int *features, int len); + +/* Decision function for non-averaged perceptron */ +/* Returns vector of weights for each class (highest weight is best class) */ +int *perceptron_decision_function_int(struct perceptron *perceptron, int *features, int len); + +/* Classify function for example (features holds list of hot features, len is number of hot features) */ +/* Returns best class number */ +/* dev_tuning and numiter are internal parameters used while training/set these to 0,0 */ +int perceptron_classify_double(struct perceptron *perceptron, int *features, int len, int dev_tuning, int numiter); + +/* Classify function for example (features holds list of hot features, len is number of hot features) */ +/* Returns best class number */ +/* Use for non-averaged perceptron */ +int perceptron_classify_int(struct perceptron *perceptron, int *features, int len); + +/* Add an example to the training set */ +/* supply perceptron handle, a vector of hot features, len of this vector, and correct class index */ +void examples_add(struct perceptron *perceptron, int *features, int len, int correctclass); + +/* Add an example to the dev set */ +/* supply perceptron handle, a vector of hot features, len of this vector, and correct class index */ +void devexamples_add(struct perceptron *perceptron, int *features, int len, int correctclass); + +/******************************************************************************/ + +struct perceptron *perceptron_init(int max_iter, int num_examples, int num_devexamples, int num_features, int num_classes, int averaged, int shuffle, int random_seed, int tune_on_averaged, int verbose) { + struct perceptron *p; + p = calloc(1, sizeof(struct perceptron)); + p->max_iter = max_iter; + p->num_examples = num_examples; + p->examplecounter = 0; + p->devexamplecounter = 0; + p->num_classes = num_classes; + p->num_features = num_features; + p->shuffle = shuffle; + p->ex = calloc(num_examples, sizeof(struct examples)); + p->verbose = verbose; + p->tune_on_averaged = 
tune_on_averaged; + if (random_seed) + srand(random_seed); + if (num_devexamples > 0) { + p->num_devexamples = num_devexamples; + p->devex = calloc(num_devexamples, sizeof(struct examples)); + } + p->intweights = calloc(num_features * num_classes, sizeof(int)); + p->intbiases = calloc(num_classes, sizeof(int)); + p->averaged = averaged; + if (p->averaged) { + p->doubleweights = calloc(num_features * num_classes, sizeof(double)); + p->doublebiases = calloc(num_classes, sizeof(double)); + p->lastdweights = calloc(num_features * num_classes, sizeof(double)); + p->lastdbiases = calloc(num_classes, sizeof(double)); + p->lastiweights = calloc(num_features * num_classes, sizeof(int)); + p->lastibiases = calloc(num_classes, sizeof(int)); + } + return p; +} + +static int rand_int(int n) { + int limit = RAND_MAX - RAND_MAX % n; + int rnd; + do { + rnd = rand(); + } while (rnd >= limit); + return rnd % n; +} + +void shuffle(int *array, int n) { + int i, j, tmp; + for (i = n - 1; i > 0; i--) { + j = rand_int(i + 1); + tmp = array[j]; + array[j] = array[i]; + array[i] = tmp; + } +} + +void perceptron_train(struct perceptron *perceptron) { + int i, j, n, m, guessedclass, correctclass, *weightptr, numincorrect, itercount, *classorder, devcorrect, devlastcorrect; + double *dweightptr; + itercount = 1; + classorder = calloc(perceptron->num_examples, sizeof(int)); + for (i = 0; i < perceptron->num_examples; i++) { + classorder[i] = i; + } + devlastcorrect = 0; + for (i = 0; i < perceptron->max_iter; i++) { + if (perceptron->shuffle) + shuffle(classorder, perceptron->num_examples); + numincorrect = 0; + for (n = 0; n < perceptron->num_examples; n++) { + m = classorder[n]; + guessedclass = perceptron_classify_int(perceptron, perceptron->ex[m].hotfeatures, perceptron->ex[m].len); + correctclass = perceptron->ex[m].correctclass; + if (guessedclass != correctclass) { + numincorrect++; + for (j = 0; j < perceptron->ex[m].len; j++) { + weightptr = perceptron->intweights + perceptron->num_features * correctclass; /* Points to correct class weights */ + weightptr += *(perceptron->ex[m].hotfeatures + j); + *(weightptr) += 1; + + weightptr = perceptron->intweights + perceptron->num_features * guessedclass; /* Points to incorrect class weights */ + weightptr += *(perceptron->ex[m].hotfeatures + j); + *(weightptr) -= 1; + } + perceptron->intbiases[correctclass] += 1; + perceptron->intbiases[guessedclass] -= 1; + if (perceptron->averaged) { + for (j = 0; j < perceptron->ex[m].len; j++) { + dweightptr = perceptron->doubleweights + perceptron->num_features * correctclass; /* Points to correct class weights */ + dweightptr += *(perceptron->ex[m].hotfeatures + j); + *(dweightptr) += 1.0 * (double)itercount; + + dweightptr = perceptron->doubleweights + perceptron->num_features * guessedclass; /* Points to incorrect class weights */ + dweightptr += *(perceptron->ex[m].hotfeatures + j); + *(dweightptr) -= 1.0 * (double)itercount; + } + perceptron->doublebiases[correctclass] += 1.0 * (double)itercount; + perceptron->doublebiases[guessedclass] -= 1.0 * (double)itercount; + } + } + itercount++; + } + + /* Print stats */ + if (perceptron->verbose) { + fprintf(stderr, "Iteration %i - TRAIN: (%i/%i) %lg", i+1, perceptron->num_examples-numincorrect, perceptron->num_examples, (double)(perceptron->num_examples-numincorrect)/(double)perceptron->num_examples); + } + /* Now test on dev set (if available) */ + if (perceptron->num_devexamples > 0) { + for (n = 0, devcorrect = 0; n < perceptron->num_devexamples; n++) { + if 
(perceptron->averaged && perceptron->tune_on_averaged) { + guessedclass = perceptron_classify_double(perceptron, perceptron->devex[n].hotfeatures, perceptron->devex[n].len, 1, itercount); + } else { + guessedclass = perceptron_classify_int(perceptron, perceptron->devex[n].hotfeatures, perceptron->devex[n].len); + } + correctclass = perceptron->devex[n].correctclass; + if (guessedclass == correctclass) { + devcorrect++; + } + } + if (perceptron->verbose) + fprintf(stderr, " - DEV (%i/%i) %lg", devcorrect, perceptron->num_devexamples, (double)devcorrect/(double)perceptron->num_devexamples); + if (devcorrect < devlastcorrect) { + if (perceptron->verbose) + fprintf(stderr, "\n"); + break; /* Stop iterations - performance went down */ + } + devlastcorrect = devcorrect; + } + if (perceptron->verbose) + fprintf(stderr, "\n"); + if (numincorrect == 0) { + break; + } + /* Store current (averaged) weights so we can restore them if performance goes down */ + if (perceptron->averaged) { + memcpy(perceptron->lastdweights, perceptron->doubleweights, perceptron->num_features * perceptron->num_classes * sizeof(double)); + memcpy(perceptron->lastdbiases, perceptron->doublebiases, perceptron->num_classes * sizeof(double)); + memcpy(perceptron->lastiweights, perceptron->intweights, perceptron->num_features * perceptron->num_classes * sizeof(int)); + memcpy(perceptron->lastibiases, perceptron->intbiases, perceptron->num_classes * sizeof(int)); + } + } + if (perceptron->averaged) { + /* If we use AP w/ dev set, take previous weights because performance has dropped on dev set */ + for (i = 0; i < perceptron->num_features * perceptron->num_classes; i++) { + if (perceptron->num_devexamples > 0 && perceptron->tune_on_averaged) + perceptron->doubleweights[i] = (double)perceptron->lastiweights[i] - perceptron->lastdweights[i]/((double)itercount - 1); + else + perceptron->doubleweights[i] = (double)perceptron->intweights[i] - perceptron->doubleweights[i]/(double)itercount; + } + for (i = 0; i < perceptron->num_classes; i++) { + if (perceptron->num_devexamples > 0 && perceptron->tune_on_averaged) + perceptron->doublebiases[i] = (double)perceptron->lastibiases[i] - perceptron->lastdbiases[i]/((double)itercount - 1); + else + perceptron->doublebiases[i] = (double)perceptron->intbiases[i] - perceptron->doublebiases[i]/(double)itercount; + } + } +} + +void perceptron_free_wrapper(void *ptr) { + if (ptr != NULL) + free(ptr); +} + +void perceptron_destroy(struct perceptron *p) { + free(p->ex); + if (p->num_devexamples > 0) { + free(p->devex); + } + free(p->intweights); + free(p->intbiases); + if (p->averaged) { + free(p->doubleweights); + free(p->doublebiases); + free(p->lastdweights); + free(p->lastdbiases); + free(p->lastiweights); + free(p->lastibiases); + } + free(p); +} + +double *perceptron_decision_function_double(struct perceptron *perceptron, int *features, int len) { + int f, c, fnum; + double cumweight, *fweight, *prediction; + prediction = calloc(perceptron->num_classes, sizeof(double)); + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0.0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->doubleweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->doublebiases[c]; + prediction[c] = cumweight; + } + return prediction; +} + +int *perceptron_decision_function_int(struct perceptron *perceptron, int *features, int len) { + int f, c, fnum; + int cumweight, *fweight, *prediction; + prediction = 
calloc(perceptron->num_classes, sizeof(int)); + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->intweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->intbiases[c]; + prediction[c] = cumweight; + } + return prediction; +} + +int perceptron_classify_double(struct perceptron *perceptron, int *features, int len, int dev_tuning, int numiter) { + int f, c, fnum, maxclass, ptr; + double maxweight, cumweight, fweight; + maxclass = 0; + maxweight = -DBL_MAX; + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0.0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + if (dev_tuning) { + ptr = perceptron->num_features * c + fnum; + fweight = (double)perceptron->intweights[ptr] - perceptron->doubleweights[ptr]/(double)numiter; + } else { + fweight = perceptron->doubleweights[perceptron->num_features * c + fnum]; + } + cumweight += fweight; + } + if (dev_tuning) { + cumweight += (double)perceptron->intbiases[c] - perceptron->doublebiases[c]/(double)numiter; + } else { + cumweight += perceptron->doublebiases[c]; + } + if (cumweight > maxweight) { + maxweight = cumweight; + maxclass = c; + } + } + return maxclass; +} + +int perceptron_classify_int(struct perceptron *perceptron, int *features, int len) { + int f, c, *fweight, fnum, maxclass, maxweight, cumweight; + maxclass = 0; + maxweight = -INT_MAX; + for (c = 0; c < perceptron->num_classes; c++) { + cumweight = 0; + for (f = 0; f < len; f++) { + fnum = features[f]; /* Feature that is hot */ + fweight = perceptron->intweights + perceptron->num_features * c + fnum; + cumweight += *fweight; + } + cumweight += perceptron->intbiases[c]; + if (cumweight > maxweight) { + maxweight = cumweight; + maxclass = c; + } + } + return maxclass; +} + +void examples_add(struct perceptron *perceptron, int *features, int len, int correctclass) { + struct examples *ex; + ex = perceptron->ex; + ex[perceptron->examplecounter].len = len; + ex[perceptron->examplecounter].correctclass = correctclass; + ex[perceptron->examplecounter].hotfeatures = malloc(len * sizeof(int)); + memcpy(ex[perceptron->examplecounter].hotfeatures, features, len * sizeof(int)); + perceptron->examplecounter++; +} + +void devexamples_add(struct perceptron *perceptron, int *features, int len, int correctclass) { + struct examples *ex; + ex = perceptron->devex; + ex[perceptron->devexamplecounter].len = len; + ex[perceptron->devexamplecounter].correctclass = correctclass; + ex[perceptron->devexamplecounter].hotfeatures = malloc(len * sizeof(int)); + memcpy(ex[perceptron->devexamplecounter].hotfeatures, features, len * sizeof(int)); + perceptron->devexamplecounter++; +} diff --git a/baseline/perceptron_c.py b/baseline/perceptron_c.py new file mode 100644 index 0000000..bbbd094 --- /dev/null +++ b/baseline/perceptron_c.py @@ -0,0 +1,114 @@ +# Wrapper around simple perceptron/averaged perceptron C-library. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 + +# Relies on C-code in libpercetron.so built from percetron.c through ctypes. 
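+#
+# Illustrative usage (a minimal sketch; assumes libperceptron.so has been built
+# with `make` and sits in the current working directory, since the module loads
+# './libperceptron.so'; the toy features and labels below are made up):
+#
+#   from perceptron_c import Perceptron
+#   X = [['f=low', 'len=2'], ['f=high', 'len=2'], ['f=low', 'len=3']]
+#   y = ['A', 'B', 'A']
+#   P = Perceptron(max_iter=10, averaged=True, shuffle=True, random_seed=42,
+#                  verbose=True)
+#   P.fit(X, y)              # dev data may also be passed: P.fit(X, y, dX, dy)
+#   # decision_function returns (class, score) pairs; the best class scores highest.
+#   scores = P.decision_function(['f=low', 'len=3'])
+#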
+# Author: Mans Hulden + +from ctypes import * + +perceptron = cdll.LoadLibrary('./libperceptron.so') + +perceptron_perceptron_init = perceptron.perceptron_init +perceptron_perceptron_init.restype = c_void_p +perceptron_examples_add = perceptron.examples_add +perceptron_examples_add.restype = None +perceptron_devexamples_add = perceptron.devexamples_add +perceptron_devexamples_add.restype = None +perceptron_perceptron_train = perceptron.perceptron_train +perceptron_perceptron_train.restype = None +perceptron_perceptron_classify_int = perceptron.perceptron_classify_int +perceptron_perceptron_classify_int.restype = c_int +perceptron_perceptron_classify_double = perceptron.perceptron_classify_double +perceptron_perceptron_classify_double.restype = c_int +perceptron_perceptron_decision_function_double = perceptron.perceptron_decision_function_double +perceptron_perceptron_decision_function_double.restype = POINTER(c_double) +perceptron_perceptron_decision_function_int = perceptron.perceptron_decision_function_int +perceptron_perceptron_decision_function_int.restype = POINTER(c_int) +perceptron_perceptron_free_wrapper = perceptron.perceptron_free_wrapper +perceptron_perceptron_destroy = perceptron.perceptron_destroy + +class Perceptron: + + def __init__(self, max_iter = 20, averaged = False, shuffle = True, random_seed = False, tune_on_averaged = False, verbose = False): + self.max_iter = max_iter + self.averaged = averaged + self.shuffle = shuffle + self.random_seed = random_seed + self.verbose = verbose + self.tune_on_averaged = tune_on_averaged + self.perceptronhandle = None + + def __del__(self): + if self.perceptronhandle: + perceptron_perceptron_destroy(c_void_p(self.perceptronhandle)) + + def fit(self, features, classes, devfeatures = [], devclasses = []): + # Map features to integers starting from 0 + self.num_examples = len(features) + self.num_devexamples = len(devfeatures) + fset = sorted(list(set([f for g in features + devfeatures for f in g]))) + self.inttofeat = dict(zip(range(len(fset)), fset)) + self.feattoint = dict(zip(fset, range(len(fset)))) + self.features = [[self.feattoint[f] for f in g] for g in features] + self.num_features = len(fset) + self.devfeatures = [[self.feattoint[f] for f in g] for g in devfeatures] + + # Map classes to integers starting from 0 + cset = sorted(list(set([c for c in classes + devclasses]))) + self.inttoclass = dict(zip(range(len(cset)), cset)) + self.classtoint = dict(zip(cset, range(len(cset)))) + self.classes = [self.classtoint[f] for f in classes] + self.devclasses = [self.classtoint[f] for f in devclasses] + self.num_classes = len(cset) + + self.perceptronhandle = perceptron_perceptron_init(c_int(self.max_iter), c_int(self.num_examples), c_int(self.num_devexamples), c_int(self.num_features), c_int(self.num_classes), c_int(self.averaged), c_int(self.shuffle), c_int(self.random_seed), c_int(self.tune_on_averaged), c_int(self.verbose)) + + for index, example_fs in enumerate(self.features): + f = (c_int * len(example_fs))(*example_fs) + perceptron_examples_add(c_void_p(self.perceptronhandle), f, c_int(len(example_fs)), c_int(self.classes[index])) + + for index, example_fs in enumerate(self.devfeatures): + f = (c_int * len(example_fs))(*example_fs) + perceptron_devexamples_add(c_void_p(self.perceptronhandle), f, c_int(len(example_fs)), c_int(self.devclasses[index])) + + perceptron_perceptron_train(c_void_p(self.perceptronhandle)) + + def decision_function(self, features): + test_fs = [self.feattoint[f] for f in features if f in self.feattoint] + 
f = (c_int * len(test_fs))(*test_fs) + if self.averaged: + classweights = perceptron_perceptron_decision_function_double(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + else: + classweights = perceptron_perceptron_decision_function_int(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + c = [(self.inttoclass[i], classweights[i]) for i in range(self.num_classes)] + perceptron_perceptron_free_wrapper(classweights) + return c + + def predict(self, features): + test_fs = [self.feattoint[f] for f in features if f in self.feattoint] + f = (c_int * len(test_fs))(*test_fs) + if self.averaged: + correctclass = perceptron_perceptron_classify_double(c_void_p(self.perceptronhandle), f, c_int(len(test_fs)), c_int(0), c_int(0)) + else: + correctclass = perceptron_perceptron_classify_int(c_void_p(self.perceptronhandle), f, c_int(len(test_fs))) + return self.inttoclass[correctclass] + +if __name__ == "__main__": + P = Perceptron(shuffle = True, averaged = True, verbose = True) + # 4 training examples, no dev examples (can use any data type for features) + # We simply list the 'hot' features for each example + features = [['w','x','y','z'], ['u','w','x'],[232,'w'],[232,'x','y','z']] + # The corresponding classes + classes = ['CLASS_A','CLASS_A','CLASS_B','CLASS_A'] + # Train + P.fit(features, classes) + # Show probabilities of the classes for an instance + print(P.decision_function([232, 'w', 'z'])) # Print weights for classes + # Show how the classes correspond to indices + print(P.classtoint) + # Show the best class for example + print(P.predict([232, 'w', 'z'])) diff --git a/load_and_test_model_interactive.sh b/load_and_test_model_interactive.sh new file mode 100755 index 0000000..99f5868 --- /dev/null +++ b/load_and_test_model_interactive.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 3 ]; then # number of args + echo "USAGE: ${ME} " + echo + exit +fi +ftrain=$1 +model=$2 +seed=$3 +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --mono \ + --beam_width 1 \ + --restore ${model} \ + --interactive \ + --seed ${seed} + +if [[ $? == 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/load_and_test_model_on_files.sh b/load_and_test_model_on_files.sh new file mode 100755 index 0000000..713e5ac --- /dev/null +++ b/load_and_test_model_on_files.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 5 ]; then # number of args + echo "USAGE: " + echo "$ME" + exit +fi +ftrain=$1 +ftest=$2 +model=$3 +seed=$4 +out=$5 +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --ftest ${ftest} \ + --mono \ + --beam_width 1 \ + --restore ${model} \ + --seed ${seed} \ + --dump ${out} + + + + + +if [[ $? 
== 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/readers/aligned_reader.py b/readers/aligned_reader.py new file mode 100644 index 0000000..46ab9ab --- /dev/null +++ b/readers/aligned_reader.py @@ -0,0 +1,121 @@ +from __future__ import division +from __future__ import print_function + +import sys +import logging + +from seq2seq.lang import Lang +from seq2seq.constants import ALIGN_SYMBOL +from baseline import align_utils + +import random +from collections import Counter +# from seq2seq.main import oracle_action +from seq2seq.constants import STEP + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +import argparse + + +def safe_replace_spaces(s): + s = s.replace(" ", "#") + s = s.replace(" ", "") + s = s.replace("#", " ") + return s + + +def read_examples(fpath, native_or_eng="both", remove_spaces=False, weight=1.0): + examples = [] + bad = 0 + for idx, l in enumerate(open(fpath)): + parts = l.strip().split('\t') + if len(parts) == 3: + fr_sent, en_sent = parts[:2] + is_eng = True + elif len(parts) == 2: + # print(parts) + fr_sent, en_sent = parts + is_eng = False + elif len(parts) == 4: + fr_sent, en_sent, is_eng = parts[:3] + is_eng = True if is_eng=="True" else False + else: + logging.info("#%d bad line %d %s", bad, idx, parts) + bad += 1 + continue + if remove_spaces: + # fr_sent = fr_sent.replace(" ", "") + # en_sent = en_sent.replace(" ", "") + fr_sent = safe_replace_spaces(fr_sent) + en_sent = safe_replace_spaces(en_sent) + if native_or_eng == "nat" and not is_eng: + examples.append((fr_sent, en_sent, weight, is_eng)) + elif native_or_eng == "eng" and is_eng: + examples.append((fr_sent, en_sent, weight, is_eng)) + elif native_or_eng == "both": + examples.append((fr_sent, en_sent, weight, is_eng)) + else: + pass + if "!!!" 
in l and not is_eng: + logging.info("wierd line %s", l) + num_engs = sum([1 if ex[-1] == True else 0 for ex in examples]) + num_nats = sum([1 if ex[-1] == False else 0 for ex in examples]) + logging.info("read %d examples in \"%s\" mode", len(examples), native_or_eng) + logging.info("# engs %d", num_engs) + logging.info("# nats %d", num_nats) + return examples + + +def align_examples(examples, seed, algo="mcmc"): + logging.info("aligning using %d examples", len(examples)) + + pairs = [(x, y) for x, y, weight, is_eng in examples] + is_eng_list = [(weight,is_eng) for x, y, weight, is_eng in examples] + if algo == "dumb": + raise NotImplementedError + else: + aligned_pairs = align_utils.mcmc_align(pairs, ALIGN_SYMBOL, seed=seed) + ans = [(ax, ay, weight, is_eng) for (ax, ay), (weight, is_eng) in zip(aligned_pairs, is_eng_list)] + return ans + + +def load_aligned_data(examples, seed, mode=None): + ans = [] + + if mode == "mcmc": + aligned_data = align_examples(examples=examples, seed=seed) + else: + # No alignments --> seq2seq + aligned_data = examples + for x, y, weight, is_eng in aligned_data: + if mode == "mcmc": + raw_x, raw_y = x.replace(ALIGN_SYMBOL, ""), y.replace(ALIGN_SYMBOL, "") + raw_x, raw_y = ' '.join(list(raw_x)), ' '.join(list(raw_y)) + elif mode == "m2m": + raise NotImplementedError + else: + raw_x, raw_y = x, y + xs, ys = ' '.join(list(x)), ' '.join(list(y)) + ans.append((raw_x, raw_y, xs, ys, weight, is_eng)) + return ans + + +def oracle_action(example): + raw_x, raw_y, x, y, weight, is_eng = example + x = x.split(' ') + y = y.split(' ') + actions = [] + inputs = [] + alignments = list(zip(x, y)) + for idx, a in enumerate(alignments): + # if 1-to-0 alignment, then step + if a[1] == ALIGN_SYMBOL: + actions.append(STEP) + inputs.append(a[0]) + else: + actions.append(a[1]) + inputs.append(a[0]) + if idx + 1 < len(alignments) and alignments[idx + 1][0] != ALIGN_SYMBOL: + actions.append(STEP) + return inputs,actions + diff --git a/seq2seq/constants.py b/seq2seq/constants.py new file mode 100644 index 0000000..787077f --- /dev/null +++ b/seq2seq/constants.py @@ -0,0 +1,11 @@ +SOS_token = '' +EOS_token = '' +UNK = '' +SOS_ID = 0 +EOS_ID = 1 +UNK_ID = 2 +ALIGN_SYMBOL = '~' +STEP = '' +EPSILON = '' +ALIGN_SYMBOL = '~' + diff --git a/seq2seq/encoder.py b/seq2seq/encoder.py new file mode 100644 index 0000000..f73fb23 --- /dev/null +++ b/seq2seq/encoder.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V + + +class EncoderRNN(nn.Module): + def __init__(self, invoc_size: int, vector_size: int, hidden_size: int, n_layers: int = 1, batch_first: bool = True, + bidi=True, device_id=None) -> None: + super(EncoderRNN, self).__init__() + self.input_size = invoc_size + self.hidden_size = hidden_size + self.n_layers = n_layers + self.device_id = device_id + self.embedding = nn.Embedding(num_embeddings=invoc_size, + embedding_dim=vector_size) + self.bidi = bidi + self.batch_first = batch_first + self.gru = nn.GRU(input_size=vector_size, + hidden_size=hidden_size, + num_layers=n_layers, + batch_first=batch_first, + bidirectional=bidi + ) + self.no_pack_padded_seq = False + + def forward(self, word_inputs, hidden): + # Note: works with only batch_size = 1 + # Note: we run this all at once (over the whole input sequence) + max_len = len(word_inputs) + # L x D + embedded = self.embedding(word_inputs) + # 1 x L x D, batch first is True + embedded = embedded.view(1, max_len, -1) + # 1 x L x D, 1 x H x D + output, hidden = self.gru(embedded, hidden) + return 
output, hidden + + def init_hidden(self, batch_size=1): + if self.bidi: + k = self.n_layers * 2 + else: + k = self.n_layers * 1 + hidden = V(torch.zeros(k, batch_size, self.hidden_size)) + if self.device_id is not None: + hidden = hidden.cuda(self.device_id) + return hidden + + def _cuda(self, m): + if self.device_id is not None: + return m.cuda(self.device_id) + return m + + +if __name__ == '__main__': + encoder = EncoderRNN(invoc_size=10, vector_size=13, hidden_size=20) + print(encoder) + hidden = encoder.init_hidden() + input_words = torch.LongTensor([1, 2, 3, 4]) + output, hidden = encoder(V(input_words), hidden) + print('Output size:', output.size()) + print('Hidden size:', [h.size() for h in hidden]) diff --git a/seq2seq/evaluators/reporter.py b/seq2seq/evaluators/reporter.py new file mode 100644 index 0000000..c4b72bd --- /dev/null +++ b/seq2seq/evaluators/reporter.py @@ -0,0 +1,185 @@ +from __future__ import division +import shutil + +from collections import Counter, defaultdict +import logging + +import numpy as np + +from seq2seq.constants import EOS_token, SOS_token +from utils.news_evaluation_script import news_evaluation +from utils.news_evaluation_script.news_evaluation import compute_edit_dist as ED +from seq2seq.constants import STEP + +__author__ = 'Shyam' + + +def get_decoded_words(decoded_outputs): + ans = [] + # print(decoded_outputs) + for score, output in decoded_outputs: + if output[-1] == EOS_token: + output = output[:-1] + if output[0] == SOS_token: + output = output[1:] + output = [p for p in output if p != STEP] + output = " ".join(output) + ans.append((score, output)) + # print(ans) + return ans + + +def compute_acc_at_position(pred_dict, gold_dict, pos): + correct = 0 + for src_word in gold_dict: + gold = gold_dict[src_word] + preds = pred_dict[src_word] + # print(preds,gold) + if preds[pos] == gold[0]: + correct += 1 + # print(correct) + return correct + + +def print_evauation_details(pred_dict, gold_dict, header="all", vocab=None, beam_width=None): + acc_map, f, f_best_match, mrr, map_ref, acc_10, edit_dist, nrm_edit_dist = news_evaluation.evaluate(pred_dict=pred_dict, + gold_dict=gold_dict) + N = len(acc_map) + if N == 0: + logging.info("N is 0, returning ...") + return 0.0 + edit_dist_freqs = Counter(list(edit_dist.values())) + # for k in edit_dist: + # print(k,edit_dist[k]) + mean_ed_at_1 = np.mean(list(edit_dist.values())) + std_ed_at_1 = np.std(list(edit_dist.values())) + mean_ned_at_1 = np.mean(list(nrm_edit_dist.values())) + median_ed_at_1 = np.median(list(edit_dist.values())) + acc_num = float(sum([acc_map[src_word] for src_word in acc_map.keys()])) + acc10_num = float(sum([acc_10[src_word] for src_word in acc_10.keys()])) + accuracy = acc_num / N + accuracy10 = acc10_num / N + macro_f1 = float(sum([f[src_word] for src_word in f.keys()])) / N + logging.info(20 * "*" + header + 20 * "*") + logging.info('ACC: %f (%d/%d)', accuracy, acc_num, N) + logging.info('Mean F-score: %f', macro_f1) + logging.info('Mean ED@1: %f+-%.3f', mean_ed_at_1,std_ed_at_1) + logging.info('Mean NED@1: %f', mean_ned_at_1) + logging.info('Median ED@1: %f', median_ed_at_1) + for d in range(3): + logging.info('edit dist of %d: %f (%d/%d)', d, edit_dist_freqs[d] / N, edit_dist_freqs[d], N) + + if beam_width is not None: + for d in range(beam_width): + acc_at_d = compute_acc_at_position(pred_dict=pred_dict, gold_dict=gold_dict, pos=d) + logging.info("acc at %d: %.3f (%d/%d)", d, acc_at_d / N, acc_at_d, N) + # logging.info('MRR: %f', float(sum([mrr[src_word] for src_word in 
mrr.keys()])) / N) + # logging.info('MAP_ref: %f', float(sum([map_ref[src_word] for src_word in map_ref.keys()])) / N) + logging.info('ACC@10: %f (%d/%d)', accuracy10, acc10_num, N) + return accuracy, accuracy10 + + +class AccReporter: + def __init__(self, args, dump_file=None): + self.best_acc = 0 + self.args = args + self.best_acc10 = 0 + self.best_eng_acc = 0 + self.best_nat_acc = 0 + self.best_seen = 0 + self.best_epoch = 0 + self.dump_file = dump_file + + def print_details(self, epoch, gold_dict, pred_dict, header="all"): + beam_width = self.args["beam_width"] + # epoch, gold_dict, pred_dict, header = "all" + accuracy, accuracy10 = print_evauation_details(gold_dict=gold_dict, pred_dict=pred_dict, + header=header, beam_width=beam_width) + return accuracy, accuracy10 + + def report_eval(self, epoch, seen, examples, evaler): + pred_dict, gold_dict = {}, {} + eng_pred_dict, eng_gold_dict = {}, {} + nat_pred_dict, nat_gold_dict = {}, {} + correct = 0 + correct_nat = 0 + correct_eng = 0 + if self.dump_file is not None: + out = open(self.dump_file, "w") + else: + out = None + eng_nwords = sum([1 for (_, _, weight, is_eng) in examples if is_eng]) + nat_nwords = sum([1 for (_, _, weight, is_eng) in examples if not is_eng]) + for idx, example in enumerate(examples): + x, y, weight, is_eng = example + # print(weight,is_eng) + if idx > 0 and idx % 200 == 0: + logging.info("running infer on example %d", idx) + + decoded_outputs = evaler.infer_on_example(sentence=x) + scores_and_words = get_decoded_words(decoded_outputs) + decoded_words = [w for s, w in scores_and_words] + + key = x.replace(" ", "") + + pred_dict[key] = decoded_words + gold_dict[key] = [y] + + if is_eng: + eng_pred_dict[key] = decoded_words + eng_gold_dict[key] = [y] + else: + nat_pred_dict[key] = decoded_words + nat_gold_dict[key] = [y] + + if decoded_words[0] == y: + correct += 1 + if is_eng: + correct_eng += 1 + else: + correct_nat += 1 + + if out is not None: + edit_dists = ";".join([str(ED(ref=y, candidate=word)) for score, word in scores_and_words]) + beam_outputs = ";".join([word for score, word in scores_and_words]) + beam_scores = ";".join([str(score) for score, word in scores_and_words]) + buf = "%s\t%s\t%s\t%s\t%s\t%s\n" % (x, y, is_eng, beam_outputs, beam_scores, edit_dists) + out.write(buf) + + logging.info("accuracy %d/%d=%.2f", correct, len(examples), correct / len(examples)) + NAT_ACC = 0.0 if nat_nwords == 0 else correct_nat / nat_nwords + ENG_ACC = 0.0 if eng_nwords == 0 else correct_eng / eng_nwords + logging.info("accuracy (nat) %d/%d=%.2f", correct_nat, nat_nwords, NAT_ACC) + logging.info("accuracy (eng) %d/%d=%.2f", correct_eng, eng_nwords, ENG_ACC) + if out is not None: + out.close() + all_acc, all_acc10 = self.print_details(header="total", epoch=epoch, gold_dict=gold_dict, pred_dict=pred_dict) + if eng_nwords == 0: + eng_acc, eng_acc10 = 0, 0 + else: + eng_acc, eng_acc10 = self.print_details(header="eng", epoch=epoch, gold_dict=eng_gold_dict, + pred_dict=eng_pred_dict) + nat_acc, nat_acc10 = self.print_details(header="nat", epoch=epoch, gold_dict=nat_gold_dict, + pred_dict=nat_pred_dict) + ret_val = False + if eng_acc > self.best_eng_acc: + self.best_eng_acc = eng_acc + if nat_acc > self.best_nat_acc: + self.best_nat_acc = nat_acc + if all_acc10 > self.best_acc10: + self.best_acc10 = all_acc10 + if all_acc > self.best_acc: + self.best_acc = all_acc + self.best_seen = seen + self.best_epoch = epoch + ret_val = True + if ret_val is True and self.dump_file is not None: + bestpred = self.dump_file + '_best.txt' 
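+ # The dump file written above has one tab-separated line per test example:
+ # source, gold, is_eng, beam outputs, beam scores, and edit distances, where the
+ # last three fields join the beam candidates with ";". The copy made below
+ # (dump_file + '_best.txt') snapshots the predictions from the best epoch so far.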
+ logging.info("saving best predictions to file %s",bestpred) + shutil.copyfile(self.dump_file, bestpred) + logging.info("best accuracy: %.3f", self.best_acc) + logging.info("best accuracy@10: %.3f", self.best_acc10) + logging.info("best after %d mini-batches (%d epoch)", self.best_seen, self.best_epoch) + logging.info("best eng accuracy: %.3f", self.best_eng_acc) + logging.info("best nat accuracy: %.3f", self.best_nat_acc) + return ret_val, self.best_acc diff --git a/seq2seq/inferences/evaluate.py b/seq2seq/inferences/evaluate.py new file mode 100644 index 0000000..c0dc021 --- /dev/null +++ b/seq2seq/inferences/evaluate.py @@ -0,0 +1,27 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V +from torch import optim +import torch.nn.functional as F +from seq2seq.constants import EOS_token, SOS_ID, EOS_ID +from seq2seq.constants import SOS_token +from seq2seq.torch_utils import variable_from_sentence + + +class Inference: + def __init__(self, encoder, decoder, input_lang, output_lang, device_id=None): + self.encoder = encoder + self.decoder = decoder + self.input_lang, self.output_lang = input_lang, output_lang + self.device_id = device_id + + def infer_on_example(self, sentence): + self.encoder.eval() + self.decoder.eval() + ans = self.run_inference(sentence) + self.encoder.train() + self.decoder.train() + return ans + + def run_inference(self, sentence, max_length=100): + raise NotImplementedError diff --git a/seq2seq/inferences/monotonic_infer.py b/seq2seq/inferences/monotonic_infer.py new file mode 100644 index 0000000..6883b44 --- /dev/null +++ b/seq2seq/inferences/monotonic_infer.py @@ -0,0 +1,172 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch.autograd import Variable as V + +from seq2seq.constants import EOS_token +from seq2seq.constants import SOS_token, SOS_ID, UNK_ID +from seq2seq.constants import STEP +from seq2seq.inferences.evaluate import Inference + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +__author__ = 'Shyam' + + +def encode_string(input_str, word2index): + ans = [] + for w in input_str: + if w in word2index: + t = word2index[w] + else: + t = UNK_ID + ans.append(t) + return ans + + +class MonotonicInference(Inference): + def __init__(self, encoder, decoder, fr_lang, en_lang, device_id=None, beam_width=1, norm_by_length=False): + self.encoder = encoder + self.decoder = decoder + self.fr_lang, self.en_lang = fr_lang, en_lang + self.K = beam_width + self.norm_by_length = norm_by_length + self.device_id = device_id + + def run_inference(self, x, max_length=60): + # list of tokens + padded_lemma = [SOS_token] + x.split(' ') + [EOS_token] + padded_lemma_idx = encode_string(input_str=padded_lemma, word2index=self.fr_lang.word2index) + input_word = V(torch.LongTensor(padded_lemma_idx)) + enc_hid = self.encoder.init_hidden() + enc_outs, enc_hid = self.encoder(input_word, enc_hid) + + # initialize the decoder rnn + dec_hid = self.decoder.init_hidden() + + # set prev_output_vec for first lstm step as BEGIN_WORD + # prev_word = V(torch.LongTensor([SOS_ID])) + + # i is input index, j is output index + i = 0 + num_outputs = 0 + beam = [(0, i, [SOS_ID], dec_hid)] + outputs = [] + for beam_idx in range(3 * max_length): + next_beam = [] + for score, att_pos, ys, dec_hid in beam: + prev_word = V(torch.LongTensor([ys[-1]])) + decoder_output, next_dec_hid = self.decoder(prev_word, att_pos, dec_hid, enc_outs) + scores = self.decoder.out(decoder_output) + probs = F.softmax(scores, 
dim=-1) + # print("probs",probs) + topk_probs, topk_ints = torch.topk(probs, self.K, dim=2) + # print(topk_probs,topk_ints) + for k in range(self.K - len(outputs)): + top_score = np.log(topk_probs.data[0, 0, k]) + top_y = topk_ints.data[0, 0, k] + next_ys = ys + [top_y] + next_score = score + top_score + next_att_pos = att_pos + # print(top_y,self.en_lang.word2index[STEP]) + if top_y == self.en_lang.word2index[STEP]: + if att_pos < len(padded_lemma) - 1: + next_att_pos = att_pos + 1 + else: + next_att_pos = att_pos + if top_y == self.en_lang.word2index[EOS_token] or len(next_ys) == 3 * max_length: + # if not self.min_output_length or len(next_ys) >= self.min_output_length: + outputs.append((next_score, next_ys)) + else: + next_beam.append((next_score, next_att_pos, next_ys, next_dec_hid)) + if len(outputs) >= self.K: + break + # sort beam in descending order by score. + beam = list(sorted(next_beam, key=lambda tup: -tup[0]))[:self.K - len(outputs)] + + predicted_output_sequences = [] + for score, output in outputs: + seq = [] + for i in output: + seq.append(self.en_lang.index2word[i]) + if self.norm_by_length: + score /= len(seq) + predicted_output_sequences.append((score, seq)) + + predicted_output_sequences = sorted(predicted_output_sequences, key=lambda tup: -tup[0]) + prediction = predicted_output_sequences + return prediction + + def get_llh(self, x, y, max_length=60): + padded_lemma = [SOS_token] + x.split(' ') + [EOS_token] + padded_lemma_idx = encode_string(input_str=padded_lemma, word2index=self.fr_lang.word2index) + input_word = V(torch.LongTensor(padded_lemma_idx)) + enc_hid = self.encoder.init_hidden() + enc_outs, enc_hid = self.encoder(input_word, enc_hid) + y = y.split(' ') + [EOS_token] + # initialize the decoder rnn + dec_hid = self.decoder.init_hidden() + + # i is input index, j is output index + i = 0 + num_outputs = 0 + outputs = [] + y_idx = 0 + beam = [(0, i, y_idx, [SOS_ID], dec_hid)] + for idx in range(3 * max_length): + next_beam = [] + for score, att_pos, y_pos, ys, dec_hid in beam: + prev_word = V(torch.LongTensor([ys[-1]])) + decoder_output, next_dec_hid = self.decoder(prev_word, + att_pos, + dec_hid, + enc_outs) + scores = self.decoder.out(decoder_output) + probs = F.softmax(scores, dim=-1) + if y_pos == len(y): + seq = [] + for i in ys: + seq.append(self.en_lang.index2word[i]) + # print("finished seq:", [s for s in seq if s != STEP and s != SOS_token]) + outputs.append((score, ys)) + continue + yo = y[y_pos] + # print("yo:", yo) + # print(scores.size()) + yo_score = np.log(probs.data[0][0][self.en_lang.word2index[yo]]) + st_score = np.log(probs.data[0][0][self.en_lang.word2index[STEP]]) + if ys[-1] == self.en_lang.word2index[STEP] and ys[-2] == self.en_lang.word2index[STEP]: + possible_actions = [yo] + else: + possible_actions = [yo, STEP] + for action in possible_actions: + next_ys = ys + [self.en_lang.word2index[action]] + next_att_pos = att_pos + next_y_pos = y_pos + if action == STEP: + next_score = score + st_score + if att_pos < len(padded_lemma) - 1: + next_att_pos = att_pos + 1 + else: + next_score = score + yo_score + next_att_pos = att_pos + next_y_pos = y_pos + 1 + # print(next_ys) + # print(next_score) + new_state = (next_score, next_att_pos, next_y_pos, next_ys, next_dec_hid) + # print(new_state[:-1]) + next_beam.append(new_state) + # sort beam in descending order by score. 
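+ # Standalone sketch with toy values (not part of the decoding state): beam entries
+ # are (score, att_pos, y_pos, ys, dec_hid) tuples whose first element is a cumulative
+ # log-probability, so sorting by -score puts the most probable hypotheses first and
+ # the slice keeps at most K of them.
+ toy_beam = [(-1.2, 'hyp_a'), (-0.3, 'hyp_b'), (-2.5, 'hyp_c')]
+ toy_k = 2
+ toy_pruned = sorted(toy_beam, key=lambda tup: -tup[0])[:toy_k]
+ # toy_pruned == [(-0.3, 'hyp_b'), (-1.2, 'hyp_a')]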
+ # print("next_beam", len(next_beam)) + beam = list(sorted(next_beam, key=lambda tup: -tup[0]))[:self.K] + + outputs = sorted(outputs, key=lambda tup: -tup[0]) + predicted_output_sequences = [] + for score, output in outputs: + seq = [] + for i in output: + seq.append(self.en_lang.index2word[i]) + print("seq:", seq) + # print("seq:", [s for s in seq if s != STEP and s != SOS_token]) + print("sco:", score) + predicted_output_sequences.append((score, seq)) + print(x, y) diff --git a/seq2seq/lang.py b/seq2seq/lang.py new file mode 100644 index 0000000..9fb7347 --- /dev/null +++ b/seq2seq/lang.py @@ -0,0 +1,29 @@ +from seq2seq.constants import SOS_token, EOS_token, SOS_ID, EOS_ID, UNK, UNK_ID + + +class Lang: + def __init__(self, name): + self.name = name + self.vocab = set() + self.word2index = {SOS_token: SOS_ID, EOS_token: EOS_ID, UNK: UNK_ID} + self.word2count = {} + self.index2word = {SOS_ID: SOS_token, EOS_ID: EOS_token, UNK_ID: UNK} + self.n_words = len(self.word2index) # Count SOS and EOS + + def index_words(self, sentence): + for word in sentence.split(' '): + self.index_word(word) + + def index_word(self, word): + self.vocab.add(word) + + def compute_maps(self): + words = sorted(list(self.vocab)) + for word in words: + if word not in self.word2index: + self.word2index[word] = self.n_words + self.word2count[word] = 1 + self.index2word[self.n_words] = word + self.n_words += 1 + else: + self.word2count[word] += 1 diff --git a/seq2seq/main.py b/seq2seq/main.py new file mode 100644 index 0000000..5732fae --- /dev/null +++ b/seq2seq/main.py @@ -0,0 +1,151 @@ +import random +import logging +import sys + +import torch +import torch.nn as nn +import numpy as np + +from utils.arguments import PARSER +from readers.aligned_reader import load_aligned_data, read_examples +from seq2seq.constants import STEP +from seq2seq.evaluators.reporter import AccReporter, get_decoded_words +from seq2seq.lang import Lang +from seq2seq.runner import run +from seq2seq.trainers.monotonic_train import MonotonicTrainer +from seq2seq.model_utils import load_checkpoint, model_builder, setup_optimizers + +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +logging.basicConfig(format=':%(levelname)s: %(message)s', level=logging.INFO) + + +def subsample_examples(examples, frac, single_token): + new_examples = [] + for ex in examples: + fr, en, weight, is_eng = ex + frtokens, entokens = fr.split(" "), en.split(" ") + if len(frtokens) != len(entokens): continue + if single_token: + if len(frtokens) > 1 or len(entokens) > 1: continue + for frtok, entok in zip(frtokens, entokens): + new_examples.append((frtok, entok, weight, is_eng)) + examples = new_examples + logging.info("new examples %d", len(examples)) + # subsample if needed + random.shuffle(examples) + if frac < 1.0: + tmp = examples[0:int(frac * len(examples))] + examples = tmp + elif frac > 1.0: + tmp = examples[0:int(frac)] + examples = tmp + return examples + + +def index_vocab(examples, fr_lang, en_lang): + for ex in examples: + raw_x, raw_y, xs, ys, weight, is_eng = ex + fr_lang.index_words(xs) + en_lang.index_words(ys) + logging.info("train size %d", len(examples)) + + +langcodes = {"hi": "hindi", "fa": "farsi", "ta": "tamil", "ba": "bengali", "ka": "kannada", "he": "hebrew", + "th": "thai"} + +if __name__ == '__main__': + args = PARSER.parse_args() + args = vars(args) + logging.info(args) + batch_first = args["batch_first"] + device_id = args["device_id"] + seed = args["seed"] + native_or_eng = args["nat_or_eng"] + 
single_token = args["single_token"] + + remove_spaces = True + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + lang = langcodes[args["lang"]] + + trainpath = "data/%s/%s_train_annotateEN" % (lang, lang) if args["ftrain"] is None else args["ftrain"] + testpath = "data/%s/%s_test_annotateEN" % (lang, lang) if args["ftest"] is None else args["ftest"] + + examples = read_examples(fpath=trainpath, + native_or_eng=native_or_eng, + remove_spaces=remove_spaces) + + examples = subsample_examples(examples=examples, frac=args["frac"], single_token=single_token) + + fr_lang, en_lang = Lang(name="fr"), Lang(name="en") + examples = load_aligned_data(examples=examples, + mode="mcmc", + seed=seed) + index_vocab(examples, fr_lang, en_lang) + en_lang.index_word(STEP) + fr_lang.compute_maps() + en_lang.compute_maps() + # see_phrase_alignments(examples=examples) + logging.info(fr_lang.word2index) + logging.info(en_lang.word2index) + # ALWAYS READ ALL TEST EXAMPLES + test = read_examples(fpath=testpath) + train = read_examples(fpath=trainpath) + + train = [ex for ex in train if ' ' not in ex[0] and ' ' not in ex[1]] + logging.info("input vocab: %d", fr_lang.n_words) + logging.info("output vocab: %d", en_lang.n_words) + logging.info("beam width: %d", args["beam_width"]) + + # Initialize models + encoder, decoder, evaler = model_builder(args, fr_lang=fr_lang, en_lang=en_lang) + enc_opt, dec_opt, enc_sch, dec_sch = setup_optimizers(args=args, encoder=encoder, decoder=decoder) + criterion = nn.NLLLoss() + + trainer = MonotonicTrainer(encoder=encoder, decoder=decoder, + enc_opt=enc_opt, dec_opt=dec_opt, + enc_sch=enc_sch, dec_sch=dec_sch, + fr_lang=fr_lang, en_lang=en_lang) + + # Begin! + test_reporter = AccReporter(args=args, + dump_file=args["dump"]) + train_reporter = AccReporter(args=args, + dump_file=args["dump"] + ".train.txt" if args["dump"] is not None else None) + + if args["restore"]: + if "," in args["restore"]: + logging.info("ensembling ...") + pass + else: + load_checkpoint(encoder=encoder, decoder=decoder, + enc_opt=enc_opt, dec_opt=dec_opt, + ckpt_path=args["restore"]) + if args["interactive"]: + try: + while True: + surface = input("enter surface:") + surface = " ".join(list(surface)) + print(surface) + x, y, weight, is_eng = surface, None, 1.0, False + decoded_outputs = evaler.infer_on_example(sentence=x) + scores_and_words = get_decoded_words(decoded_outputs) + decoded_words = [w for s, w in scores_and_words] + scores = [s for s, w in scores_and_words] + print(scores_and_words) + except KeyboardInterrupt: + print('interrupted!') + sys.exit(0) + else: + logging.info(20 * "-" + "TEST" + 20 * "-") + test_reporter.report_eval(epoch=-1, seen=-1, evaler=evaler, examples=test) + + else: + run(args=args, + examples=examples, + trainer=trainer, evaler=evaler, criterion=criterion, + train=train, test=test, + train_reporter=train_reporter, test_reporter=test_reporter) diff --git a/seq2seq/model_utils.py b/seq2seq/model_utils.py new file mode 100644 index 0000000..afd454a --- /dev/null +++ b/seq2seq/model_utils.py @@ -0,0 +1,108 @@ +import logging +import sys, shutil +import os +from seq2seq.monotonic_decoder import MonotonicDecoder +from seq2seq.inferences.monotonic_infer import MonotonicInference +from seq2seq.constants import STEP +from seq2seq.encoder import EncoderRNN +import torch +from torch import optim + +__author__ = 'Shyam' + + +def setup_optimizers(args, encoder, decoder): + learning_rate = args["lr"] + reduction_factor = 
args['reduction_factor'] + patience = args['patience'] + + enc_opt = optim.Adam(encoder.parameters(), lr=learning_rate) + dec_opt = optim.Adam(decoder.parameters(), lr=learning_rate) + enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(enc_opt, + factor=reduction_factor, + patience=patience, + verbose=True) + dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(dec_opt, + factor=reduction_factor, + patience=patience, + verbose=True) + return enc_opt, dec_opt, enc_scheduler, dec_scheduler + + +def model_builder(args, fr_lang, en_lang): + bidi = args["bidi"] + device_id = args["device_id"] + batch_first = args["batch_first"] + vector_size = args["wdim"] + hidden_size = args["hdim"] + beam_width = args["beam_width"] + norm_by_length = args["norm_by_length"] + if args["mono"]: + decoder_input_size = 2 * 2 * hidden_size if bidi else 2 * hidden_size + else: + decoder_input_size = vector_size + + decoder_hidden_size = 2 * hidden_size if bidi else hidden_size + # print("hidden_size", hidden_size) + # print("decoder_hidden_size", decoder_hidden_size) + dropout_p = args["wdrop"] + + if args["mono"]: + en_lang.index_word(STEP) + + invoc_size = len(fr_lang.word2index) # 20 + outvoc_size = len(en_lang.word2index) # 30 + + encoder = EncoderRNN(invoc_size=invoc_size, + vector_size=vector_size, + hidden_size=hidden_size, + bidi=bidi, + batch_first=batch_first) + + # if args["mono"]: + decoder = MonotonicDecoder(input_size=decoder_input_size, + batch_first=batch_first, + outvoc_size=outvoc_size, + hidden_size=decoder_hidden_size) + evaler = MonotonicInference(encoder=encoder, + decoder=decoder, + fr_lang=fr_lang, + en_lang=en_lang, + beam_width=beam_width, + norm_by_length=norm_by_length) + logging.info(encoder) + logging.info(decoder) + # Move models to GPU + if device_id is not None: + encoder.cuda(device_id) + decoder.cuda(device_id) + return encoder, decoder, evaler + + +def load_checkpoint(encoder, decoder, enc_opt, dec_opt, ckpt_path): + if os.path.isfile(ckpt_path): + logging.info("=> loading checkpoint %s", ckpt_path) + checkpoint = torch.load(ckpt_path) + encoder.load_state_dict(checkpoint['enc_state_dict']) + decoder.load_state_dict(checkpoint['dec_state_dict']) + if enc_opt is not None: + enc_opt.load_state_dict(checkpoint['enc_opt_state_dict']) + if dec_opt is not None: + dec_opt.load_state_dict(checkpoint['dec_opt_state_dict']) + logging.info("=> loaded checkpoint!") + return checkpoint + # any other relevant state variables can be extracted from the checkpoint dict + else: + logging.info("=> no checkpoint at %s !!!", ckpt_path) + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + """ + From https://discuss.pytorch.org/t/saving-and-loading-a-model-in-pytorch/2610/3 + + """ + logging.info("saving model to %s", filename) + torch.save(state, filename) + if is_best: + logging.info("copying to best ...") + shutil.copyfile(filename, filename + '_best.pth.tar') diff --git a/seq2seq/monotonic_decoder.py b/seq2seq/monotonic_decoder.py new file mode 100644 index 0000000..35e7819 --- /dev/null +++ b/seq2seq/monotonic_decoder.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V + + +class MonotonicDecoder(nn.Module): + def __init__(self, input_size, outvoc_size, hidden_size, n_layers=1, device_id=None, batch_first=True): + super().__init__() + self.hidden_size = hidden_size + self.device_id = device_id + self.batch_first = batch_first + self.n_layers = n_layers + # concatenated_input_dim = input_size + hidden_size + # 
print("concatenated_input_dim",concatenated_input_dim) + self.decoder_rnn = nn.GRU(input_size=input_size, + hidden_size=hidden_size, + batch_first=True) + self.char_lookup = nn.Embedding(outvoc_size, hidden_size) + self.out = nn.Linear(in_features=hidden_size, + out_features=outvoc_size) + + def forward(self, prev_word, idx, last_hidden, encoder_outputs): + # set prev_output_vec for first lstm step as BEGIN_WORD + if self.batch_first: + encoder_outputs = encoder_outputs.transpose(0, 1) + prev_word_vec = self.char_lookup(prev_word) + attended_vec = encoder_outputs[idx] + decoder_input = torch.cat((prev_word_vec, attended_vec), dim=1) + decoder_output, hidden = self.decoder_rnn(decoder_input.unsqueeze(0), last_hidden) + return decoder_output, hidden + + def init_hidden(self, batch_size=1): + k = self.n_layers * 1 + hidden = V(torch.zeros(k, batch_size, self.hidden_size)) + if self.device_id is not None: + hidden = hidden.cuda(self.device_id) + return hidden + + def _cuda(self, m): + if self.device_id is not None: + return m.cuda(self.device_id) + return m diff --git a/seq2seq/runner.py b/seq2seq/runner.py new file mode 100644 index 0000000..ec664fc --- /dev/null +++ b/seq2seq/runner.py @@ -0,0 +1,52 @@ +import random +import logging + +import numpy as np + +from seq2seq.model_utils import save_checkpoint + +__author__ = 'Shyam' + + +def run(args, examples, trainer, criterion, evaler, train, test, test_reporter, train_reporter): + n_epochs = args["iters"] + logging.info("training on %d examples for %d epochs", len(examples), n_epochs) + random.shuffle(examples) + seen = 0 + for epoch in range(1, n_epochs + 1): + epoch_losses = [] + random.shuffle(examples) + for example in examples: + # FOR MONOTONIC MODEL, x CANNOT have any alignment characters! + ex_loss = trainer.train_on_example(example=example, + criterion=criterion) + seen += 1 + # Keep track of loss + epoch_losses.append(ex_loss) + if seen > 0 and seen % args["evalfreq"] == 0: + logging.info("seen %d loss:%.3f", seen, np.average(epoch_losses[-50:])) + best_updated, test_acc = test_reporter.report_eval(epoch=epoch, seen=seen, evaler=evaler, examples=test) + if best_updated and args["save"]: + state_dict = { + 'args': args, + 'enc_state_dict': trainer.encoder.state_dict(), + 'dec_state_dict': trainer.decoder.state_dict(), + 'enc_opt_state_dict': trainer.enc_opt.state_dict(), + 'dec_opt_state_dict': trainer.dec_opt.state_dict(), + } + save_checkpoint(state=state_dict, is_best=True, filename=args["save"]) + if seen > 0 and seen % args["logfreq"] == 0: + logging.info("seen %d loss:%.3f", seen, np.average(epoch_losses[-50:])) + logging.info("epoch loss %.3f", np.average(epoch_losses)) + if args["save"]: + logging.info("saving final model ...") + state_dict = { + 'args': args, + 'enc_state_dict': trainer.encoder.state_dict(), + 'dec_state_dict': trainer.decoder.state_dict(), + 'enc_opt_state_dict': trainer.enc_opt.state_dict(), + 'dec_opt_state_dict': trainer.dec_opt.state_dict(), + } + save_checkpoint(state=state_dict, is_best=False, filename=args["save"]) + logging.info(20 * "-" + "TEST" + 20 * "-") + test_reporter.report_eval(epoch=n_epochs, seen=seen, evaler=evaler, examples=test) diff --git a/seq2seq/torch_utils.py b/seq2seq/torch_utils.py new file mode 100644 index 0000000..56febf4 --- /dev/null +++ b/seq2seq/torch_utils.py @@ -0,0 +1,33 @@ +import torch +from torch.autograd import Variable as V + +from seq2seq.constants import EOS_ID, EOS_token + + +def variables_from_pair(x, y, input_lang=None, output_lang=None): + input_variable = 
variable_from_sentence(input_lang, x) + target_variable = variable_from_sentence(output_lang, y) + return input_variable, target_variable + + +def variable_from_sentence(lang, sentence, device_id=None): + indexes = indexes_from_sentence(lang, sentence) + indexes.append(EOS_ID) + var = V(torch.LongTensor(indexes).view(-1, 1)) + # print('var =', var) + if device_id is not None: + var = var.cuda(device_id) + return var + + +def indexes_from_sentence(lang, sentence): + return [lang.word2index[word] for word in sentence.split(' ')] + + +def pad_batch(batch, pad_unit): + lengths = [len(i) for i in batch] + max_length = max(lengths) + for ex in batch: + padding = (max_length - len(ex)) * [pad_unit] + ex += padding + return batch, lengths diff --git a/seq2seq/trainers/monotonic_train.py b/seq2seq/trainers/monotonic_train.py new file mode 100644 index 0000000..1a553c9 --- /dev/null +++ b/seq2seq/trainers/monotonic_train.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable as V +from seq2seq.constants import SOS_token, SOS_ID +from seq2seq.constants import EOS_token +from seq2seq.constants import ALIGN_SYMBOL +from seq2seq.constants import STEP +from seq2seq.constants import UNK +from seq2seq.trainers.seq2seq_attn_trainer import Seq2SeqAttnTrainer + + +def make_target(word, word2idx): + return torch.LongTensor([word2idx[word]]) + + +class MonotonicTrainer(Seq2SeqAttnTrainer): + def __init__(self, encoder, decoder, enc_opt, dec_opt, enc_sch, dec_sch, fr_lang, en_lang, clip=0.5, teacher_forcing_ratio=0.5, + device_id=None): + self.encoder = encoder + self.decoder = decoder + self.enc_opt = enc_opt + self.dec_opt = dec_opt + self.enc_sch = enc_sch + self.dec_sch = dec_sch + self.fr_lang = fr_lang + self.en_lang = en_lang + self.clip = clip + self.device_id = device_id + + def prepare_example(self, example): + raw_x, raw_y, x, y, weight, is_eng = example + raw_x, raw_y, x, y = raw_x.split(" "), raw_y.split(" "), x.split(" "), y.split(" ") + example = raw_x, raw_y, x, y, weight + return example + + def compute_loss(self, example, criterion, profile): + raw_x, raw_y, aligned_x, aligned_y, weight = example + # i is input index, j is output index + i = 0 + j = 0 + padded_raw_x = [SOS_token] + raw_x + [EOS_token] + hidden = self.encoder.init_hidden() + padded_lemma_idx = [self.fr_lang.word2index[w] for w in padded_raw_x] + input_word = V(torch.LongTensor(padded_lemma_idx)) + encoder_outputs, encoder_hidden_state = self.encoder(input_word, hidden) + aligned_x += [EOS_token] + aligned_y += [EOS_token] + + # start decoding, keeping track of sequence loss + decoder_hidden = self.decoder.init_hidden() + prev_word = V(torch.LongTensor([SOS_ID])) + loss = [] # V(torch.FloatTensor([0.0])) + + for a, (input_char, output_char) in enumerate(zip(aligned_x, aligned_y)): + possible_outputs = [] + if output_char == EOS_token: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(make_target(word=EOS_token, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + continue + + if padded_raw_x[i] == SOS_token and aligned_x[a] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(make_target(word=STEP, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + + prev_word 
= V(make_target(word=STEP, word2idx=self.en_lang.word2index)) + i += 1 + + if aligned_y[a] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + + if aligned_y[a] in self.en_lang.word2index: + target = V(make_target(word=aligned_y[a], word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + prev_word = V(make_target(word=aligned_y[a], word2idx=self.en_lang.word2index)) + else: + target = V(make_target(word=UNK, word2idx=self.en_lang.word2index)) + ex_loss = criterion(input=scores, target=target) + prev_word = V(make_target(word=UNK, word2idx=self.en_lang.word2index)) + + loss.append(ex_loss) + + j += 1 + + if i < len(padded_raw_x) - 1 and aligned_x[a + 1] != ALIGN_SYMBOL: + decoder_hidden, scores = self.step_decoder(prev_word=prev_word, i=i, + decoder_hidden=decoder_hidden, + encoder_outputs=encoder_outputs) + target = V(torch.LongTensor([self.en_lang.word2index[STEP]])) + ex_loss = criterion(input=scores, target=target) + loss.append(ex_loss) + prev_word = V(torch.LongTensor([self.en_lang.word2index[STEP]])) + # whenever you step, attend to next position + i += 1 + return weight*sum(loss)/len(loss) + + def step_decoder(self, prev_word, i, decoder_hidden, encoder_outputs): + decoder_output, decoder_hidden = self.decoder(prev_word, + i, + decoder_hidden, + encoder_outputs) + # compute local loss + scores = self.decoder.out(decoder_output) + log_softmax = nn.LogSoftmax(dim=2) + scores = log_softmax(scores) + scores = scores.squeeze(1) + return decoder_hidden, scores diff --git a/seq2seq/trainers/seq2seq_attn_trainer.py b/seq2seq/trainers/seq2seq_attn_trainer.py new file mode 100644 index 0000000..33c852c --- /dev/null +++ b/seq2seq/trainers/seq2seq_attn_trainer.py @@ -0,0 +1,195 @@ +import time +import logging + +import torch +from torch.autograd import Variable as V +import torch.nn.functional as F + +from seq2seq.constants import EOS_token, SOS_ID +from seq2seq.torch_utils import variables_from_pair + + +class Seq2SeqAttnTrainer: + def __init__(self, encoder, decoder, fr_lang, en_lang, enc_opt, dec_opt, clip=0.5, teacher_forcing_ratio=0.5, + device_id=None): + self.clip = clip + self.teacher_forcing_ratio = teacher_forcing_ratio + self.device_id = device_id + self.teacher_forcing_ratio = teacher_forcing_ratio + self.encoder = encoder + self.decoder = decoder + self.enc_opt = enc_opt + self.dec_opt = dec_opt + self.fr_lang = fr_lang + self.en_lang = en_lang + + def train_on_example(self, example, + criterion, profile=False): + + # y_length = len(y) + prep_ex = self.prepare_example(example) + + # Zero gradients of both optimizers + self.enc_opt.zero_grad() + self.dec_opt.zero_grad() + + loss = self.compute_loss(prep_ex, criterion, profile) + + # Backpropagation + loss.backward() + torch.nn.utils.clip_grad_norm(self.encoder.parameters(), self.clip) + torch.nn.utils.clip_grad_norm(self.decoder.parameters(), self.clip) + self.enc_opt.step() + self.dec_opt.step() + + return loss.data[0] + + def compute_loss(self, prep_ex, criterion, profile): + loss = [] # Added onto for each word + + x, y = prep_ex + # Get size of input and target sentences + x_length = x.size()[0] + y_length = y.size()[0] + + # Run words through encoder + tic = time.time() + encoder_hidden = self.encoder.init_hidden() + encoder_outputs, encoder_hidden = self.encoder(word_inputs=x, + hidden=encoder_hidden, + ) + toc = time.time() + if profile: logging.info("encoding time %.2f", toc - tic) + # 
print("encoder_outputs",encoder_outputs.size(),encoder_hidden.size()) + # return + # Prepare input and output variables + tic = time.time() + decoder_input = V(torch.LongTensor([[SOS_ID]])) + # Use last hidden state from encoder to start decoder + decoder_hidden = self.decoder.init_hidden(encoder_hidden) + # print("decoder_hidden",decoder_hidden.size()) + if self.device_id: + decoder_input = decoder_input.cuda(self.device_id) + + # Choose whether to use teacher forcing + use_teacher_forcing = True # random.random() < self.teacher_forcing_ratio + if use_teacher_forcing: + + # Teacher forcing: Use the ground-truth target as the next input + for di in range(y_length): + # print("decoder_input",decoder_input.size()) + decoder_output, decoder_hidden = self.decoder(decoder_input, + encoder_outputs, + decoder_hidden) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + # decoder_input = y[di] # Next target is next input + decoder_input = y[di].unsqueeze(0) # Next target is next input + + else: + # Without teacher forcing: use network's own prediction as the next input + for di in range(y_length): + decoder_output, decoder_hidden = self.decoder(decoder_input, + encoder_outputs, + decoder_hidden) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + + # Get most likely word index (highest value) from output + topv, topi = decoder_output.data.topk(1) + ni = topi[0][0] + + decoder_input = V(torch.LongTensor([[ni]])) # Chosen word is next input + if self.device_id: + decoder_input = decoder_input.cuda() + + # Stop at end of sentence (not necessary when using known targets) + if ni == EOS_token: + break + + toc = time.time() + if profile: logging.info("decoding time %.2f", toc - tic) + return sum(loss) / len(loss) + + def compute_loss_old(self, prep_ex, criterion, profile): + loss = [] # Added onto for each word + + x, y = prep_ex + # Get size of input and target sentences + x_length = x.size()[0] + y_length = y.size()[0] + + # Run words through encoder + tic = time.time() + encoder_hidden = self.encoder.init_hidden() + encoder_outputs, encoder_hidden = self.encoder(word_inputs=x, + hidden=encoder_hidden, + ) + toc = time.time() + if profile: logging.info("encoding time %.2f", toc - tic) + # print("encoder_outputs",encoder_outputs.size(),encoder_hidden.size()) + # return + # Prepare input and output variables + tic = time.time() + decoder_input = V(torch.LongTensor([[SOS_ID]])) + decoder_context = V(torch.zeros(1, self.decoder.hidden_size)) + decoder_hidden = torch.cat([encoder_hidden[0, :, :], encoder_hidden[1, :, :]], dim=-1).unsqueeze( + 0) # Use last hidden state from encoder to start decoder + # decoder_hidden = self.decoder.init_hidden() # Use last hidden state from encoder to start decoder + # print("decoder_hidden",decoder_hidden.size()) + if self.device_id: + decoder_input = decoder_input.cuda(self.device_id) + decoder_context = decoder_context.cuda(self.device_id) + + # Choose whether to use teacher forcing + use_teacher_forcing = True # random.random() < self.teacher_forcing_ratio + if use_teacher_forcing: + + # Teacher forcing: Use the ground-truth target as the next input + for di in range(y_length): + decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder(decoder_input, + decoder_context, + decoder_hidden, + encoder_outputs) + # 
print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + decoder_input = y[di] # Next target is next input + + else: + # Without teacher forcing: use network's own prediction as the next input + for di in range(y_length): + decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder(decoder_input, + decoder_context, + decoder_hidden, + encoder_outputs) + # print(decoder_output[0].size(),target_variable[di].size()) + ex_loss = criterion(input=decoder_output, target=y[di]) + loss.append(ex_loss) + + # Get most likely word index (highest value) from output + topv, topi = decoder_output.data.topk(1) + ni = topi[0][0] + + decoder_input = V(torch.LongTensor([[ni]])) # Chosen word is next input + if self.device_id: + decoder_input = decoder_input.cuda() + + # Stop at end of sentence (not necessary when using known targets) + if ni == EOS_token: + break + + toc = time.time() + if profile: logging.info("decoding time %.2f", toc - tic) + return sum(loss) / len(loss) + + def prepare_example(self, example): + raw_x, raw_y, x, y, weight, is_eng = example + training_pair = variables_from_pair(x, y, + input_lang=self.fr_lang, + output_lang=self.en_lang) + vx = training_pair[0] + vy = training_pair[1] + return vx, vy, weight diff --git a/train_model.sh b/train_model.sh new file mode 100755 index 0000000..2efb1aa --- /dev/null +++ b/train_model.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 3 ]; then # number of args + echo "USAGE: " + echo "$ME" + exit +fi +lang=$1 +seed=$2 +model=$3 +time python -m seq2seq.main \ + --lang ${lang} \ + --mono \ + --beam_width 1 \ + --save ${model} \ + --seed ${seed} + + + + + +if [[ $? == 0 ]] # success +then + : # do nothing +else # something went wrong + echo "SOME PROBLEM OCCURED"; # echo file with problems +fi diff --git a/train_model_on_files.sh b/train_model_on_files.sh new file mode 100755 index 0000000..98a2f8d --- /dev/null +++ b/train_model_on_files.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +ME=`basename $0` # for usage message + +if [ "$#" -ne 4 ]; then # number of args + echo "USAGE: ${ME} " + exit +fi +ftrain=$1 +ftest=$2 +seed=$3 +model=$4 + +time python -m seq2seq.main \ + --ftrain ${ftrain} \ + --ftest ${ftest} \ + --mono \ + --beam_width 1 \ + --save ${model} \ + --seed ${seed} + + + + + +if [[ $? 
== 0 ]] # success
+then
+ : # do nothing
+else # something went wrong
+ echo "SOME PROBLEM OCCURRED"; # echo file with problems
+fi
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/arguments.py b/utils/arguments.py
new file mode 100644
index 0000000..088cb8c
--- /dev/null
+++ b/utils/arguments.py
@@ -0,0 +1,44 @@
+import argparse
+
+PARSER = argparse.ArgumentParser(description='transliteration model')
+PARSER.add_argument('--iters', type=int, default=20, help='# train iters (default: 20)')
+PARSER.add_argument('--maxsteps', type=int, default=500000, help='max training steps (default: 500000)')
+PARSER.add_argument('--batch_size', type=int, default=1, help='batch size (default: 1)')
+PARSER.add_argument('--seed', type=int, default=42, metavar='N', help='random seed (default: 42)')
+PARSER.add_argument('--restore', type=str, default=None, help='path from which to restore model')
+PARSER.add_argument('--profile', action='store_true', help='log encoder/decoder timing')
+PARSER.add_argument('--save', type=str, default=None, help='save model')
+PARSER.add_argument('--lang', type=str, default="hi", help='language')
+PARSER.add_argument('--wdim', type=int, default=50, help='word vec size')
+PARSER.add_argument('--hdim', type=int, default=20, help='rnn hidden size')
+PARSER.add_argument('--cell', type=str, default="gru", help='rnn type')
+PARSER.add_argument('--wdrop', type=float, default=0.0, help='word dropout')
+PARSER.add_argument('--lr', type=float, default=0.001, help='learning rate')
+PARSER.add_argument('--clip', type=float, default=None, help='grad clipping')
+PARSER.add_argument('--optimizer', type=str, default="adam", help='optimizer')
+PARSER.add_argument('--extra', type=str, default=None, help='extra mined data')
+PARSER.add_argument('--nat_or_eng', type=str, default="both", help='nat|eng|both')
+PARSER.add_argument('--evalfreq', type=int, default=500, help='evaluate every N examples seen (default: 500)')
+PARSER.add_argument('--logfreq', type=int, default=100, help='log training loss every N examples seen (default: 100)')
+PARSER.add_argument('--patience', type=int, default=10, help='LR scheduler patience (default: 10)')
+PARSER.add_argument('--reduction_factor', type=float, default=0.1, help='reduction factor for LR schedule')
+PARSER.add_argument('--beam_width', type=int, default=1, help='beam width (default: 1)')
+PARSER.add_argument('--norm_by_length', dest='norm_by_length', action='store_true', help='norm score of final beam contents by length')
+PARSER.add_argument('--single_token', dest='single_token', action='store_true', help='keep only examples aligned as single tokens')
+PARSER.set_defaults(single_token=True)
+
+PARSER.add_argument('--max_output_length', type=int, default=75, help='(default: 75)')
+PARSER.add_argument('--ftrain', type=str, help='train file')
+PARSER.add_argument('--ftest', type=str, help='test/val file')
+PARSER.add_argument('--frac', type=float, default=1.0, help='frac of train data')
+PARSER.add_argument('--dump', type=str, default=None, help='to dump test predictions')
+PARSER.add_argument('--device_id', type=int, default=None, help='gpu device')
+PARSER.add_argument('--ncands', type=int, default=20, help='number of candidates')
+PARSER.add_argument('--no-bidi', dest='bidi', action='store_false', help='do not use bidirectional')
+PARSER.set_defaults(bidi=True)
+PARSER.add_argument('--no-batch-first', dest='batch_first', action='store_false', help='do not use batch first')
+PARSER.set_defaults(batch_first=True)
+PARSER.add_argument('--mono', dest='mono', action='store_true', help='use monotonic transliteration model')
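+# Usage sketch (the flag values below are placeholders, not recommended settings):
+# seq2seq/main.py consumes this parser as a plain dict via vars(PARSER.parse_args()).
+_example_args = vars(PARSER.parse_args(['--ftrain', 'train.txt', '--ftest', 'dev.txt',
+                                        '--mono', '--beam_width', '1', '--seed', '100']))
+# _example_args['mono'] is True, _example_args['beam_width'] == 1, _example_args['seed'] == 100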
+PARSER.add_argument('--interactive', action="store_true", dest="interactive") +PARSER.add_argument('--outfile', action="store", dest="outfile") + diff --git a/utils/news_evaluation_script/news_evaluation.py b/utils/news_evaluation_script/news_evaluation.py new file mode 100755 index 0000000..f243a8f --- /dev/null +++ b/utils/news_evaluation_script/news_evaluation.py @@ -0,0 +1,510 @@ +#!/usr/bin/python + +import codecs +import logging +import sys +import getopt +from os.path import basename +import xml.dom.minidom +from xml.dom.minidom import Node +# what we expect to find inside tag... +import editdistance +import numpy as np +RESULT_HEADER_ATTR = ('SourceLang', 'TargetLang', 'GroupID', 'RunID', 'RunType', 'Comments') +# ... and inside tag +CORPUS_HEADER_ATTR = ('SourceLang', 'TargetLang', 'CorpusID', 'CorpusType', 'CorpusSize', 'CorpusFormat') + +MAX_CANDIDATES = 10 + + +def usage(): + ''' + User's manual + ''' + print(''' +Transliteration results evaluation script for NEWS: +Named Entities Workshop - Shared Task on Transliteration + +Usage: + [python] %s [-h|--help] [-i|--input-file=] + [-o|--output-file=] + -t|--test-file= + --max-candidates= + [--map-n=] + +Options: + -h, --help : Print this help and quit + + --check-only : Only checks that the file is in correct format. + When this option is given, only one file is + accepted, either stdin or given with -i option. + + -i, --input-file : Input file with transliteration results in NEWS + XML format. If not given, standard input is used. + + -t, --test-file : Test file with transliteration references in NEWS + XML format. + + -o, --output-file : Output file with contribution of each source word + to each metric. If not given, no details are written. + The output file contains comma-separated values + and can be opened by a spreadsheet application + such as Microsoft Excel or OpenOffice Calc. + The values in the file are not divided by the + number of source names. + + --max-candidates : Maximum number of transliteration candidates + to consider. By default, maximum 10 candidates are + considered for evaluation according to the + NEWS 2009 whitepaper. + + +The input files must be in UTF-8. + +Example: + %s -i translit_results.xml -t test.xml -o evaluation_details.csv + +The detailed description of the metrics is in the NEWS 2010 whitepaper. + +For comments, suggestions and bug reports email to Vladimir Pervouchine +vpervouchine@i2r.a-star.edu.sg. 
+ ''' % (basename(sys.argv[0]), basename(sys.argv[0]))) + + +def get_options(): + ''' + Extracts command line arguments + ''' + input_fname = None + output_fname = None + test_fname = None + max_candidates = MAX_CANDIDATES + check_only = False + silent = False + + try: + opts, args = getopt.gnu_getopt(sys.argv[1:], 'hi:o:t:', + ['help', 'input-file=', 'output-file=', 'test-file=', + 'check-only', 'silent']) + except getopt.GetoptError as err: + sys.stderr.write('Error: %s\n' % err) + usage() + sys.exit(1) + + for o, a in opts: + if o in ('-i', '--input-file'): + input_fname = a + elif o in ('-o', '--output-file'): + output_fname = a + elif o in ('-t', '--test-file'): + test_fname = a + elif o in ('-h', '--help'): + usage() + sys.exit() + elif o in ('--check-only',): + check_only = True + elif o in ('--silent',): + silent = True + elif o in ('--max-candidates',): + try: + max_candidates = int(a) + except ValueError: + sys.stderr.write('Error: --max-candidates takes integer argument (you provided %s).\n' % a) + sys.exit(1) + if max_candidates < 1: + sys.stderr.write('Error: --max-candidates must be above 0.\n') + sys.exit(1) + + else: + sys.stderr.write('Error: unknown option %s. Type --help to see the options.\n' % o) + sys.exit(1) + + if check_only: + if test_fname or output_fname: + sys.stderr.write('No test file or output file is required to check the input format.\n') + sys.exit(1) + else: + if not test_fname: + sys.stderr.write('Error: no test file provided.\n') + sys.exit(1) + + return input_fname, output_fname, test_fname, max_candidates, check_only, silent + + +def parse_xml(f_in, max_targets=None): + ''' + Parses XML input and test files with paranoid error checking. + Returns a tuple of header and content + Content is a dictionary with source names as keys and contains lists of target names. + If max_targets is given, the number of target names in the list is cut up to max_targets names. + Header is a dictionary of header data + ''' + + stderr = codecs.getwriter('utf-8')(sys.stderr) + + doc = xml.dom.minidom.parse(f_in) + if doc.encoding.lower() != 'utf-8': + raise IOError('Invalid encoding. UTF-8 is required but %s found' % doc.encoding) + + # try results + header = doc.getElementsByTagName('TransliterationTaskResults') + is_results = True + if not header: + # try corpus + is_results = False + header = doc.getElementsByTagName('TransliterationCorpus') + if not header: + raise IOError('Unknown file. TransliterationTaskResults and TransliterationCorpus tags are missing') + if len(header) > 1: + raise IOError('Invalid file. 
Several headers were found') + header = header[0] + + # parse the comments + header_data = {} + if is_results: + attr_list = RESULT_HEADER_ATTR + else: + attr_list = CORPUS_HEADER_ATTR + + for attr in attr_list: + header_data[attr] = header.getAttribute(attr) + + # parse the data + data = {} + for node in doc.getElementsByTagName('Name'): + # we ignore the name ID unless encounter error + # get the source name + s = node.getElementsByTagName('SourceName') + # import ipdb + # ipdb.set_trace() + if not s: + raise IOError('Invalid file format: one of nodes does not have ') + if s[0].childNodes[0].nodeType == Node.TEXT_NODE: + src_name = s[0].childNodes[0].data.strip('" ') # strip quotes and spaces in case someone adds them + src_name = src_name.upper() # convert to uppercase in case it's a language where case matters + else: + raise IOError('For Name ID %s no SourceName was found or its format is invalid' % node.getAttribute('ID')) + + # get the targets + t = node.getElementsByTagName('TargetName') + if not t: + raise IOError('Invalid file format: one of nodes does not have ') + # we'll read target names as tuples: (target_name, ID) so that the list can later be sorted + # according to the ID, which is going to be removed after that. + tgt_list = [] + for t_node in t: + # get the ID, which is the rank for transliteration candidates + try: + tgt_id = int(t_node.getAttribute('ID')) + except ValueError: + raise IOError( + 'For name ID %s (%s) one of target names have invalid ID' % (node.getAttribute('ID'), src_name)) + # get the word + if not t_node.childNodes: + raise IOError('For name ID %s (%s) one of the target names ID %s is empty' % ( + node.getAttribute('ID'), src_name, tgt_id)) + if t_node.childNodes[0].nodeType == Node.TEXT_NODE: + tgt_name = t_node.childNodes[0].data.strip('" ') + if tgt_name: + tgt_name = tgt_name.upper() # convert to uppercase in case it matters + tgt_list.append((tgt_name, tgt_id)) + else: + stderr.write( + 'Warning: Name ID %s (%s) contains empty target words\n' % (node.getAttribute('ID'), src_name)) + else: + raise IOError('For name ID %s (%s) one of target names ID %s have invalid format' % ( + node.getAttribute('ID'), src_name, tgt_id)) + + # sort by ID + if not tgt_list: + stderr.write('Warning: no non-empty target words found for name ID %s (%s). This name is ignored.\n' % ( + node.getAttribute('ID'), src_name)) + + else: + + tgt_list.sort(key=lambda x: x[1]) + # check for duplicate IDs: if there are any, they must be adjacent elements after sorting + # we only care for IDs to be unique in the results file because IDs are ranks there. 
+ if is_results: + for i in range(len(tgt_list) - 1): + if tgt_list[i][1] == tgt_list[i + 1][1]: + raise IOError( + 'XML results file contains duplicate IDs for transliterations of word %s' % src_name) + + # cut up to max_targets + if max_targets: + tgt_list = tgt_list[0:max_targets] + + data[src_name] = [tgt[0] for tgt in tgt_list] # remove IDs, we don't need them anymore + + # test (codecs.getwriter('utf-8')(sys.stdout)).write('Name: %s\n' % (data[src_name][0])) + # test raise IOError('%s' % data[src_name][0]) + + return header_data, data, is_results + + +def LCS_length(s1, s2): + ''' + Calculates the length of the longest common subsequence of s1 and s2 + s1 and s2 must be anything iterable + The implementation is almost copy-pasted from Wikibooks.org + ''' + m = len(s1) + n = len(s2) + # An (m+1) times (n+1) matrix + C = [[0] * (n + 1) for i in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + if s1[i - 1] == s2[j - 1]: + C[i][j] = C[i - 1][j - 1] + 1 + else: + C[i][j] = max(C[i][j - 1], C[i - 1][j]) + return C[m][n] + + +def levenshtein(s1, s2): + if len(s1) < len(s2): + return levenshtein(s2, s1) + + # len(s1) >= len(s2) + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[ + j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer + deletions = current_row[j] + 1 # than s2 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] + + +def compute_edit_dist(ref, candidate): + ref = ref.replace(" ", "") + candidate = candidate.replace(" ", "") + return editdistance.eval(ref, candidate) + + +def f_score(candidate, references): + ''' + Calculates F-score for the candidate and its best matching reference + Returns F-score and best matching reference + ''' + # determine the best matching reference (the one with the shortest ED) + best_ref = references[0] + if len(candidate) == 0: + return 0.0, best_ref, 100, 0 + best_ref_lcs = LCS_length(candidate, references[0]) + for ref in references[1:]: + lcs = LCS_length(candidate, ref) + if (len(ref) - 2 * lcs) < (len(best_ref) - 2 * best_ref_lcs): + best_ref = ref + best_ref_lcs = lcs + + # try: + precision = float(best_ref_lcs) / float(len(candidate)) + recall = float(best_ref_lcs) / float(len(best_ref)) + # except: + # import ipdb + # ipdb.set_trace() + # edit_dist = levenshtein(best_ref,candidate) + # edit_dist = Levenshtein.distance(best_ref,candidate) + edit_dist = compute_edit_dist(ref=best_ref, candidate=candidate) + nrm_edit_dist = edit_dist / len(best_ref) + # print("best_ref:", best_ref, "candidate:", candidate, "edit_dist:", edit_dist) + if best_ref_lcs: + return 2 * precision * recall / (precision + recall), best_ref, edit_dist, nrm_edit_dist + else: + return 0.0, best_ref, edit_dist, nrm_edit_dist + + +def mean_average_precision(candidates, references, n): + ''' + Calculates mean average precision up to n candidates. + ''' + + total = 0.0 + num_correct = 0 + for k in range(n): + if k < len(candidates) and (candidates[k] in references): + num_correct += 1 + total += float(num_correct) / float(k + 1) + + return total / float(n) + + +def inverse_rank(candidates, reference): + ''' + Returns inverse rank of the matching candidate given the reference + Returns 0 if no match was found. 
+ ''' + rank = 0 + while (rank < len(candidates)) and (candidates[rank] != reference): + rank += 1 + if rank == len(candidates): + return 0.0 + else: + return 1.0 / (rank + 1) + + +def evaluate(pred_dict, gold_dict): + ''' + REMEMBER! -- pred_dict and gold_dict should be word to lists dictionaries. + The list will be ordered in descending order in the pred_dict. + If you only have a single reference make sure its in a list. + + Evaluates all metrics to save looping over input_data several times + n is the map-n parameter + Returns acc, f_score, mrr, map_ref, map_n + ''' + mrr = {} + acc = {} + f = {} + f_best_match = {} + # map_n = {} + map_ref = {} + # map_sys = {} + acc_10 = {} + edit_dist = {} + nrm_edit_dist = {} + for src_word in gold_dict.keys(): + if src_word in pred_dict: + candidates = pred_dict[src_word] + references = gold_dict[src_word] + + acc[src_word] = max([int(candidates[0] == ref) for ref in references]) # either 1 or 0 + + f[src_word], f_best_match[src_word], edit_dist[src_word], nrm_edit_dist[src_word] = f_score(candidates[0], references) + + mrr[src_word] = max([inverse_rank(candidates, ref) for ref in references]) + + # map_n[src_word] = mean_average_precision(candidates, references, n) + map_ref[src_word] = mean_average_precision(candidates, references, len(references)) + # map_sys[src_word] = mean_average_precision(candidates, references, len(candidates)) + + ## compute accuracy at 10- Anoop + acc_10[src_word] = max([int(ref in candidates) for ref in references]) # either 1 or 0 + + else: + logging.error('Warning: No transliterations found for word %s\n' % src_word) + mrr[src_word] = 0.0 + acc[src_word] = 0.0 + f[src_word] = 0.0 + edit_dist[src_word] = np.infty + nrm_edit_dist[src_word] = 1.0 + f_best_match[src_word] = '' + # map_n[src_word] = 0.0 + map_ref[src_word] = 0.0 + # map_sys[src_word] = 0.0 + # Anoop + acc_10[src_word] = 0.0 + + return acc, f, f_best_match, mrr, map_ref, acc_10, edit_dist, nrm_edit_dist # added by Anoop + + +def write_details(output_fname, input_data, test_data, acc, f, f_best_match, mrr, map_ref, acc_10): + ''' + Writes detailed results to CSV file + ''' + if output_fname == '-': + f_out = codecs.getwriter('utf-8')(sys.stdout) + else: + f_out = codecs.open(output_fname, 'w', 'utf-8') + + f_out.write('%s\n' % ( + ','.join(['"Source word"', '"First candidate"', '"ACC"', '"ACC-10"', '"F-score"', '"Best matching reference"', + '"MRR"', '"MAP_ref"', '"References"']))) + + for src_word in test_data.keys(): + if src_word in input_data: + first_candidate = input_data[src_word][0] + else: + first_candidate = '' + + f_out.write('%s,%s,%f,%f,%f,%s,%f,%f,%s\n' % ( + src_word, first_candidate, acc[src_word], acc_10[src_word], f[src_word], f_best_match[src_word], + mrr[src_word], + map_ref[src_word], '"' + ' | '.join(test_data[src_word]) + '"')) + + if output_fname != '-': + f_out.close() + + +def main(): + input_fname, output_fname, test_fname, max_candidates, check_only, silent = get_options() + stderr = codecs.getwriter('utf-8')(sys.stderr) + + if not input_fname: + f = sys.stdin + else: + f = input_fname + try: + input_header, input_data, is_results = parse_xml(f, max_targets=max_candidates) + except IOError as e: + error_message = e.strerror + if not error_message: + error_message = e.message + stderr.write(u'Error encountered while parsing input: %s.\n' % error_message) + sys.exit(1) + + if check_only: + stdout = codecs.getwriter('utf-8')(sys.stdout) + + if not silent: + if is_results: + corpus_type = 'testing or reference' + else: + corpus_type 
= 'training or development' + stdout.write('This is %s corpus\n' % corpus_type) + for elem in input_header.keys(): + stdout.write('%30s : %-30s\n' % (elem, input_header[elem])) + stdout.write('Number of words: %d\n' % len(input_data)) + else: + stdout.write("OK\n") + + sys.exit() + + try: + test_header, test_data, is_results = parse_xml(test_fname) + except IOError as e: + error_message = e.strerror + if not error_message: + error_message = e.message + stderr.write(u'Error encountered while parsing test file. Here is what the parser said:\n%s.\n' % error_message) + sys.exit(1) + + acc, f, f_best_match, mrr, map_ref, acc_10 = evaluate(input_data, test_data) + + if output_fname: + write_details(output_fname, input_data, test_data, acc, f, f_best_match, mrr, map_ref, acc_10) + + N = len(acc) + acc_num = float(sum([acc[src_word] for src_word in acc.keys()])) + acc10_num = float(sum([acc_10[src_word] for src_word in acc_10.keys()])) + sys.stdout.write('ACC: %f (%d/%d)\n' % (acc_num / N, acc_num, N)) + sys.stdout.write('Mean F-score: %f\n' % (float(sum([f[src_word] for src_word in f.keys()])) / N)) + sys.stdout.write('MRR: %f\n' % (float(sum([mrr[src_word] for src_word in mrr.keys()])) / N)) + sys.stdout.write('MAP_ref: %f\n' % (float(sum([map_ref[src_word] for src_word in map_ref.keys()])) / N)) + sys.stdout.write('ACC@10: %f (%d/%d)\n' % (acc10_num / N, acc10_num, N)) + # sys.stdout.write('MAP_%d: %f\n' % (n, float(sum([map_n[src_word] for src_word in map_n.keys()]))/N)) + # sys.stdout.write('MAP_sys: %f\n' % (float(sum([map_sys[src_word] for src_word in map_sys.keys()]))/N)) + + +def test(): + stdout = codecs.getwriter('utf-8')(sys.stdout) + input_header, input_data, is_result = parse_xml('news_results.xml', max_targets=10) + test_header, test_data, is_result = parse_xml('news_test.xml') + acc, f, f_best_match, mrr, map_ref = evaluate(input_data, test_data) + for src_word in test_data.keys(): + stdout.write('%10s ACC=%f\tF-score=%f (%s)\tMRR=%f\tMAP_ref=%f\n' % ( + src_word, acc[src_word], f[src_word], f_best_match[src_word], mrr[src_word], map_ref[src_word])) + + +if __name__ == '__main__': + main() + # test() diff --git a/utils/news_evaluation_script/news_results.xml b/utils/news_evaluation_script/news_results.xml new file mode 100644 index 0000000..55f96b7 --- /dev/null +++ b/utils/news_evaluation_script/news_results.xml @@ -0,0 +1,108 @@ + + + + + + + + ABARBANEL + 阿巴巴纳尔 + 阿巴巴内尔 + 阿巴班尔 + 阿巴本尔 + 阿巴巴尼尔 + 阿巴班埃尔 + 阿巴班尔 + 阿巴尔班埃尔 + 阿巴巴奈 + 阿巴巴纳 + 阿巴尔班尔 + 埃巴巴纳尔 + 阿巴尔巴纳尔 + 阿巴巴恩尔 + 阿巴贝内尔 + 埃巴巴内尔 + 阿巴尔巴内尔 + 阿巴本埃尔 + 阿巴班克尔 + 阿巴贝恩尔 + + + + ABBELL + 阿贝尔 + 阿贝尔尔 + 阿布贝尔 + 阿卜贝尔 + 埃贝尔 + 阿布尔 + 阿比尔 + 艾贝尔 + 艾布贝尔 + 阿比尔 + 阿贝利 + 阿比埃尔 + 奥贝尔 + 阿贝莱 + 亚贝尔 + 阿比厄尔 + 阿比雷尔 + 亚伯贝尔 + 阿布比尔 + 埃布贝尔 + + + ABBOT + 阿伯特 + 阿比特 + 阿布伯特 + 阿比奥特 + 埃伯特 + 阿卜伯特 + 阿布特 + 艾伯特 + 阿布布特 + 阿布奥特 + 阿布鲍特 + 奥伯特 + 阿比厄特 + 亚伯特 + 阿伯得 + 阿伯克 + 艾布伯特 + 阿卜布特 + 阿伯托 + 阿比欧 + + + ABELE + 阿贝尔 + 阿伯利 + 阿布尔 + 阿伯尔 + 阿比尔 + 阿贝利 + 阿布尔 + 阿贝尔尔 + 埃布尔 + 阿贝勒 + 阿布斯 + 阿布尔尔 + 阿布利 + 阿贝尔斯 + 埃贝尔 + 阿比利 + 阿拜尔 + 埃布尔尔 + 埃比尔 + 阿布勒 + + + + diff --git a/utils/news_evaluation_script/news_test.xml b/utils/news_evaluation_script/news_test.xml new file mode 100644 index 0000000..2b1034c --- /dev/null +++ b/utils/news_evaluation_script/news_test.xml @@ -0,0 +1,36 @@ + + + + + + + + ABARBANEL + + 阿巴巴纳尔 + + + + + ABBELL + 阿贝尔尔 + + + + ABBOT + 阿布伯特 + 埃伯特 + + + ABELE + + 阿贝尔埃伯特 + + +
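
As a quick sanity check of the metrics computed by `utils/news_evaluation_script/news_evaluation.py` (ACC, LCS-based mean F-score, MRR, MAP_ref, ACC@10), `evaluate()` can be called directly on toy dictionaries instead of going through the NEWS XML files. The snippet below is an illustrative sketch, not part of the patch: it assumes the script is importable as `news_evaluation` (e.g. when run from inside `utils/news_evaluation_script/`), that the `editdistance` and `numpy` packages are installed, and the prediction/reference data are invented for illustration.

```python
# Illustrative sketch only (not part of the patch). Assumes news_evaluation.py
# is importable as `news_evaluation` (e.g. run this from within
# utils/news_evaluation_script/) and that `editdistance` and `numpy` are installed.
import news_evaluation as ne

# pred_dict: source word -> ranked list of candidate transliterations
# gold_dict: source word -> list of acceptable reference transliterations
pred_dict = {'OBAMA': ['obama', 'abama'], 'ABBOT': ['abot']}
gold_dict = {'OBAMA': ['obama'], 'ABBOT': ['abbot', 'abbott']}

acc, f, f_best, mrr, map_ref, acc_10, ed, ned = ne.evaluate(pred_dict, gold_dict)

n = len(gold_dict)
print('ACC: %f' % (float(sum(acc.values())) / n))         # top-1 exact match
print('Mean F-score: %f' % (float(sum(f.values())) / n))  # LCS-based F-score
print('MRR: %f' % (float(sum(mrr.values())) / n))         # mean reciprocal rank
print('ACC@10: %f' % (float(sum(acc_10.values())) / n))   # match anywhere in the candidate list
```

Running the script itself, rather than importing it, goes through `parse_xml`, which expects results and references in the NEWS XML format illustrated by `news_results.xml` and `news_test.xml` above.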