-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgenerate_elmo.py
75 lines (62 loc) · 2.55 KB
/
generate_elmo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ssl
# SECURITY NOTE(review): this swaps the stdlib's default HTTPS context factory
# for the *unverified* one, so HTTPS requests that use the default context in
# this process skip TLS certificate verification. Presumably added so the
# TF-Hub module download below works on hosts with a broken CA bundle --
# confirm it is still required, and prefer fixing the CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context
import argparse
import tensorflow as tf
import tensorflow_hub as hub
import h5py
import numpy as np
import json
# Log the TensorFlow version at startup; the code below uses the TF 1.x
# graph/session API (tf.placeholder, tf.Session, hub.Module).
print((tf.__version__))
# Project-local helper; called as set_gpus(0) before the graph is built.
from util import set_gpus
def Elmo(fn, outfn):
    """Cache ELMo embeddings for every sentence of a JSON-lines corpus in HDF5.

    Each non-blank line of ``fn`` is parsed as a JSON object that must contain
    a ``"doc_key"`` string and a ``"sentences"`` list of token lists. Every
    sentence is run through the module-level ``lm_emb_op`` on its own (batch
    size 1) and the result, with the batch axis dropped, is stored in ``outfn``
    as a float32 dataset named ``"<doc_key>_<sentence index>"``.

    This function reads the module-level graph handles ``lm_emb_op``,
    ``sentences`` and ``text_len``; they must be defined before it is called.

    Args:
        fn: Path of the input JSON-lines file.
        outfn: Path of the HDF5 file to write. It is opened in append mode
            (``"a"``): created if missing, extended if it already exists.

    Raises:
        KeyError: If an example lacks ``"doc_key"`` or ``"sentences"``.
        ValueError: If a line is not valid JSON, or (raised by h5py) if a
            dataset with the same sentence id already exists in ``outfn``.
    """
    with open(fn) as f:
        # Iterate the file lazily and skip blank lines (e.g. a trailing empty
        # line at EOF) instead of letting json.loads crash on them.
        examples = [json.loads(line) for line in f if line.strip()]

    # Pull out the required fields up front so a malformed example fails fast,
    # before the session is created and before anything is written to disk.
    docs = [(example["sentences"], example["doc_key"]) for example in examples]

    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Pass the mode explicitly: h5py >= 3.0 defaults to read-only ("r"),
        # which makes create_dataset fail. "a" is the pre-3.0 default this
        # code was written against. The context manager closes the file, so
        # no explicit close() call is needed.
        with h5py.File(outfn, "a") as fout:
            for i, (doc, docid) in enumerate(docs):
                if i % 100 == 0:
                    print('Finished ' + str(i))
                for j, tokens in enumerate(doc):
                    # Feed one sentence at a time as a batch of size 1.
                    lm_emb = sess.run(
                        lm_emb_op,
                        feed_dict={
                            sentences: [tokens],
                            text_len: [len(tokens)],
                        },
                    )
                    sentence_id = docid + '_' + str(j)
                    fout.create_dataset(
                        sentence_id,
                        lm_emb.shape[1:],
                        dtype='float32',
                        data=lm_emb[0, :, :, :],  # [slen, lm_size, lm_layers]
                    )
#### Model #####
# Script section: parse the CLI arguments, build the ELMo graph, then run the
# caching pass. `sentences`, `text_len` and `lm_emb_op` are deliberately left
# as module-level globals because Elmo() looks them up by name.
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True,
                    help='JSON-lines file with "doc_key" and "sentences" per line')
parser.add_argument('--output', required=True,
                    help='HDF5 file that receives one dataset per sentence')
args = parser.parse_args()

# NOTE(review): set_gpus comes from the project-local util module; presumably
# it selects which GPU is visible -- confirm against util.py. It is called
# before any graph construction, and that order is preserved here.
set_gpus(0)
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
sentences = tf.placeholder('string', shape=(None, None))  # [B, slen] tokens
# `(None)` is plain `None` (unspecified rank), not a tuple; use a real 1-tuple
# so the placeholder is declared as a rank-1 vector of sentence lengths.
text_len = tf.placeholder('int32', shape=(None,))  # [B]
print("READY TO PARSE ELMO")
lm_embeddings = elmo(
    inputs={
        "tokens": sentences,
        "sequence_len": text_len
    },
    signature="tokens", as_dict=True)
# Add a trailing "layer" axis to each output so the three can be concatenated.
word_emb = tf.expand_dims(lm_embeddings["word_emb"], 3)  # [B, slen, 512, 1]
lm_emb_op = tf.concat([
    # Duplicate the 512-d word embedding along the feature axis so its width
    # matches the 1024-d LSTM outputs.
    tf.concat([word_emb, word_emb], 2),  # [B, slen, 1024, 1]
    tf.expand_dims(lm_embeddings["lstm_outputs1"], 3),
    tf.expand_dims(lm_embeddings["lstm_outputs2"], 3)], 3)  # [B, slen, 1024, 3]
Elmo(args.input, args.output)