-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlexsub.py
executable file
·101 lines (89 loc) · 4.38 KB
/
lexsub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
""" Different models for performing lexical substitution task
(find substitution for a word in context)"""
from nltk.corpus import lin_thesaurus as lin
from nltk.corpus import wordnet as wn
import numpy as np
from numpy.linalg import norm
from gensim.models import KeyedVectors
import tools
from tools import process_candidates
# Default word2vec binary file; LexSub loads it when no word vectors are passed in.
WORD2VEC_PATH = "~/GoogleNews-vectors-negative300-SLIM.bin"
def lin_synonyms(word, pos):
    """Return the Lin thesaurus synonyms of `word`, best scored first.

    word = word to look up
    pos = part of speech; its upper-cased value selects the
          'sim<POS>.lsp' thesaurus file
    """
    scored = list(lin.scored_synonyms(word, fileid='sim%s.lsp' % pos.upper()))
    # rank the (synonym, score) pairs by score, highest first
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return [synonym for synonym, _ in scored]
def wordnet_synonyms(word, pos):
    """Return the WordNet synonym lemmas of `word` for the given part of speech.

    word = word to look up
    pos = part of speech: 'n' (noun), 'a' (adjective) or 'v' (verb)

    The lemma names of every matching synset are returned in no particular
    order; the list may contain duplicates and the word itself.
    Raises ValueError for any other part of speech.
    """
    # reject unsupported values before touching the corpus
    if pos not in ('n', 'a', 'v'):
        raise ValueError("unsupported part of speech: %r" % (pos,))
    wn_pos = {'n': wn.NOUN, 'a': wn.ADJ, 'v': wn.VERB}[pos]
    synsets = wn.synsets(word, wn_pos)
    # return synonym lemmas in no particular order
    return [lemma.name() for synset in synsets for lemma in synset.lemmas()]
class LexSub(object):
    """Find word substitutions for a word in context using word2vec skip-gram embedding.

    Candidate substitutes come from one of three generators (word2vec
    nearest neighbours, the Lin thesaurus or WordNet). When a context
    sentence is given they are ranked by their average similarity to the
    target word and to the context words.
    """

    def __init__(self, word_vectors=None, n_substitutes=5,
                 candidate_generator='word2vec', n_candidates=50):
        """
        word_vectors = word vectors to use; when None they are loaded
                       from WORD2VEC_PATH
        n_substitutes = number of lexical substitutes to generate
        candidate_generator = word2vec, lin, wordnet
        n_candidates = maximum number of candidates kept for ranking

        Raises ValueError for an unknown candidate_generator.
        """
        # validate first so a bad argument fails before the model file is loaded
        if candidate_generator not in ('word2vec', 'lin', 'wordnet'):
            raise ValueError("Invalid value candidate_generator: %r" % (candidate_generator,))
        self.candidate_generator = candidate_generator
        self.n_substitutes = n_substitutes
        # supported POS values
        self.poses = ['n', 'a', 'v']
        # number of generated candidates for substitution
        self.n_candidates = n_candidates
        if word_vectors is None:
            word_vectors = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
        self.word_vectors = word_vectors

    def get_candidates(self, word, POS):
        """ Generate at most n_candidates candidate substitutes for a word
        word = target word (without the POS suffix)
        POS = part of speech, used by the lin and wordnet generators
        """
        if self.candidate_generator == 'word2vec':
            # request n_candidates neighbours explicitly; otherwise
            # most_similar falls back to its own default count
            neighbours = self.word_vectors.most_similar(
                positive=[word], topn=self.n_candidates)
            result = [neighbour for neighbour, _ in neighbours]
        elif self.candidate_generator == 'lin':
            result = lin_synonyms(word, POS)
        elif self.candidate_generator == 'wordnet':
            result = wordnet_synonyms(word, POS)
        else:
            # only reachable if the attribute was changed after construction
            raise ValueError("Invalid value candidate_generator: %r"
                             % (self.candidate_generator,))
        # words to lower case, replace underscore, remove duplicates,
        # remove target word and stop words, clip length
        return process_candidates(result, word)[:self.n_candidates]

    def get_substitutability(self, t, s, C):
        """ get substitutability of substitution s for target t in context C
        t = target word
        s = candidate substitution
        C = list of context words

        Returns the mean of the similarity between s and t and the
        similarities between s and every word of C.
        """
        # 1. target score: how similar is it to the target word?
        tscore = self.word_vectors.similarity(t, s)
        # 2. context score: how similar is it to the context words?
        cscore = sum(self.word_vectors.similarity(s, c) for c in C)
        return (tscore + cscore) / (len(C) + 1)

    def lex_sub(self, word_POS, sentence):
        """ Get appropriate substitutions for a word given its context
        word_POS = word with part of speech in form word.POS e.g. dog.n
        sentence = text the word occurs in, or None to skip context ranking

        Returns a list of at most n_substitutes words, best first.
        """
        w, _, POS = word_POS.partition('.')
        # generate candidate substitutions
        candidates = self.get_candidates(w, POS)
        vectors = self.word_vectors
        # without a context, or with a target the embedding does not know,
        # the candidates cannot be scored: keep the generator's order
        if sentence is None or w not in vectors:
            return candidates[:self.n_substitutes]
        # filter context words: exist in the word2vec vocab, not stop words
        # (membership is tested on the vectors object itself, which works
        # across gensim versions, unlike the removed .vocab attribute)
        context_words = [c for c in tools.get_words(sentence)
                         if c in vectors and c not in tools.stopwords]
        # candidates missing from the embedding get a score of 0
        scored = [(s, self.get_substitutability(w, s, context_words) if s in vectors else 0)
                  for s in candidates]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [s for s, _ in scored[:self.n_substitutes]]