-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdocu_learn.py
163 lines (144 loc) · 6.03 KB
/
docu_learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import glob
import os
from bs4 import BeautifulSoup
import bs4
import string
import flair
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings
import torch
# create a StackedEmbedding object that combines glove and forward/backward flair embeddings
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_similarity_score
#import numpy as np
from docx import Document
import sys
import numpy as np
from itertools import islice
from collections import deque
import csv
from random import shuffle
from sklearn.externals import joblib
from time import time
import pickle
import umap
from sklearn.pipeline import make_union, Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
#from sklearn.pipeline import Pipeline, make_pipeline
import eli5
from eli5.lime import TextExplainer
from eli5 import explain_prediction
from eli5.formatters import format_as_text, format_as_html
import pandas as pd
from IPython.display import display
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
# Run-mode switches for the interactive loop below.
keras = True       # the persisted pipeline's final step is a Keras model (loaded separately from .h5)
increment = False  # when True, partial_fit the model on each newly labeled example

# Document embedder: mean-pools fastText 'en-crawl' word embeddings into one
# vector per document. The commented alternatives (GloVe/BERT/ELMo/Flair) were
# evidently tried and abandoned; 'en-crawl' is the active configuration.
stacked_embeddings = DocumentPoolEmbeddings([
    #WordEmbeddings('en'),
    #WordEmbeddings('glove'),
    WordEmbeddings('en-crawl'),#ELMoEmbeddings('original'),
    #BertEmbeddings('bert-base-cased'),
    #FlairEmbeddings('news-forward-fast'),
    #FlairEmbeddings('news-backward-fast'),
]) #, mode='max')
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs = 5):
    """Build and compile a 2-layer Keras classifier over document embeddings.

    Hidden layer width equals the embedding dimensionality; output layer has
    11 softmax units (one per class).

    Args:
        optimizer: Keras optimizer name or instance for ``model.compile``.
        kernel_initializer: initializer for the output layer's weights
            (the hidden layer is fixed to 'he_uniform', matching ReLU).
        epochs: unused here; kept so KerasClassifier grid-search params that
            include it don't break the signature.

    Returns:
        A compiled ``keras.models.Sequential`` model.
    """
    model = Sequential()
    # BUG FIX: the original read `list_of_embeddings[1].size`, a name defined
    # nowhere in this module, so any call raised NameError. The intended
    # width is the embedding vector length, which the pooled embedder
    # exposes directly (same value Text2Vec.transform uses for zero vectors).
    model.add(Dense(stacked_embeddings.embedding_length, activation='relu',
                    kernel_initializer='he_uniform', use_bias=True))
    model.add(Dense(11, activation='softmax',
                    kernel_initializer=kernel_initializer, use_bias=True))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model
def parse_string(a_str):
    """Normalize raw text for embedding.

    Keeps only ASCII letters and whitespace, lowercases the letters, and
    collapses every run of whitespace into a single space.

    Args:
        a_str: arbitrary input text.

    Returns:
        The normalized string ("" if nothing survives the filter).
    """
    allowed = string.ascii_letters + string.whitespace
    filtered = "".join(ch for ch in a_str if ch in allowed).lower()
    return " ".join(filtered.split())
class Text2Vec(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer mapping text to pooled flair embeddings.

    ``transform`` accepts either a single string (returns one row-vector of
    shape (1, dim), or a 1-D zero vector when the string normalizes to empty
    — NOTE(review): that shape inconsistency is preserved from the original;
    confirm downstream code tolerates it) or an iterable of documents
    (returns a 2-D array, one row per document). Documents that normalize
    to the empty string map to zero vectors.
    """

    def fit(self, X, y=None):
        # Stateless: embeddings are pretrained, nothing to learn.
        return self

    def transform(self, X):
        size_of_emb = stacked_embeddings.embedding_length
        if isinstance(X, str):
            try:
                p_str = parse_string(X)
                if not p_str:
                    # Degenerate input: a zero vector of the embedding size.
                    return np.zeros((size_of_emb,), dtype=np.float32)
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                return a_set.get_embedding().cpu().detach().numpy().reshape(1, -1)
            except Exception:
                # BUG FIX: the original bare `except` printed diagnostics and
                # then fell through to `return to_ret` with `to_ret` unbound,
                # raising UnboundLocalError and masking the real error.
                # Print the diagnostics, then re-raise the actual exception.
                print(type(X))
                print(X)
                raise
        # Iterable of documents: embed each, zero-filling empty ones.
        list_of_emb = []
        for doc in X:
            p_str = parse_string(doc)
            if not p_str:
                list_of_emb.append(np.zeros((size_of_emb,), dtype=np.float32))
            else:
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                list_of_emb.append(a_set.get_embedding().cpu().detach().numpy())
        return np.array(list_of_emb)
# Restore the persisted classification pipeline. The Keras model is stripped
# before pickling (see the 'stop' branch below), so it is re-attached from
# its own .h5 file when the keras flag is set.
pipe = joblib.load('saved_card_classification.pkl')
if keras:
    pipe.named_steps['model'].model = load_model('keras_model.h5')
# LIME-style explainer used to show which tokens drove each prediction.
te = TextExplainer(random_state=42, n_samples=3000, position_dependent=False)
def explain_pred(sentence):
    """Explain the pipeline's prediction for one document.

    Fits the module-level TextExplainer ``te`` against ``pipe.predict_proba``,
    appends an HTML rendering of the explanation to ``latest_prediction.html``,
    and prints the explainer's fit metrics.

    Args:
        sentence: the document text to explain.
    """
    te.fit(sentence, pipe.predict_proba)
    #txt = format_as_text(te.explain_prediction(target_names=["green", "neutral", "red"]))
    t_pred = te.explain_prediction(top = 20, target_names=["ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED", "HEG", "NAT", "POL", "TOP", "ORI", "QER","COL",])
    # FIX: dropped the unused `txt = format_as_text(t_pred)` local and
    # switched to a context manager so the report file is closed even if
    # the write raises.
    html = format_as_html(t_pred)
    with open("latest_prediction.html", "a+") as html_file:
        html_file.write(html)
    print(te.metrics_)
def print_misclass():
    """Print the indices of validation examples the pipeline misclassifies."""
    # NOTE(review): Y_val and X_val are not defined anywhere in this module,
    # so calling this raises NameError — presumably they existed in an
    # interactive session this script was extracted from. TODO confirm.
    print("misclassified examples!!!")
    print(np.where(Y_val != pipe.predict(X_val)))
# Interactive labeling loop: classify pasted documents, show an explanation,
# and append any ground-truth labels the operator supplies to the CSV dataset.
# Label commands: "" = skip, "f" = break out of the loop, "stop" = persist
# the model/pipeline and exit; anything else is a space-separated label list.
with open('card_classification.csv', 'a') as csvfile:
    spamwriter = csv.writer(csvfile)
    done = False
    while not done:
        to_process = input("Please copy and paste a document to be classified Ctrl-shift-D or ctrl-D to exit")
        print("MODEL PREDICTION:")
        pred = pipe.predict(str(to_process))
        print(pred)
        explain_pred(str(to_process))
        label = input("What is the ground truth label of this? Seperate labels with a space")
        if label == "":
            # No label given: nothing is logged, loop again.
            pass
        elif label == "f":
            break
        elif label == "stop":
            # Persist everything and terminate the process.
            csvfile.close()  # NOTE(review): redundant — the `with` block would close it; harmless before sys.exit()
            if keras and increment:
                # Keras models don't survive joblib pickling: save the
                # network separately and strip it from the pipeline first.
                pipe.named_steps['model'].model.save('keras_model.h5')
                pipe.named_steps['model'].model = None
            joblib.dump(pipe, 'saved_card_classification.pkl')
            print("Model Dumped!!!!")
            done = True
            sys.exit()
        else:
            the_labels = label.split()
            if increment == True:
                # Embed the document and update the model online with the
                # operator-provided labels.
                t_model = pipe.named_steps['model']
                ppset = Sentence(str(to_process))
                stacked_embeddings.embed(ppset)
                the_emb = ppset.get_embedding().cpu().detach().numpy().reshape(1, -1)
                t_model.partial_fit(the_emb, the_labels) ##INCREMENTAL LEARNING MODE ENGAGED
            # CSV row format: label(s) first, document text last.
            the_labels.append(str(to_process))
            spamwriter.writerow(the_labels)
            csvfile.flush()  # flush per example so labels survive a crash