# https://youtu.be/zyCpntcVKSo
"""
@author: Sreenivas Bhattiprolu
Download text file from: http://www.gutenberg.org/ebooks/236
"""
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import sys
#LOAD TEXT
#If saving the text from Notepad, select UTF-8 from the encoding dropdown when saving
filename = "files/the_jungle_book.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
print(raw_text[0:1000])
#CLEAN TEXT
#Remove numbers
raw_text = ''.join(c for c in raw_text if not c.isdigit())
#Build the vocabulary: a sorted list of every unique character in the text
chars = sorted(list(set(raw_text)))
#Character sequences must be encoded as integers.
#Each unique character will be assigned an integer value.
#Create a dictionary of characters mapped to integer values
char_to_int = dict((c, i) for i, c in enumerate(chars))
#Do the reverse so we can print our predictions in characters and not integers
int_to_char = dict((i, c) for i, c in enumerate(chars))
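#Optional sanity check (added illustration, not in the original script): the two
#dictionaries should invert each other for any character in the vocabulary.
example_char = chars[0] #an arbitrary character guaranteed to be in the vocab
assert int_to_char[char_to_int[example_char]] == example_char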
# summarize the data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters in the text; corpus length: ", n_chars)
print("Total Vocab: ", n_vocab)
########################
#Now that we have characters we can create input/output sequences for training
#Remember that for an LSTM both the input and the output can be sequences... hence the term seq2seq
seq_length = 60 #Length of each input sequence
step = 10 #Instead of moving 1 letter at a time, try skipping a few.
sentences = [] # X values (Sentences)
next_chars = [] # Y values. The character that follows the sentence defined as X
for i in range(0, n_chars - seq_length, step): #step=1 means each sentence is offset just by a single letter
    sentences.append(raw_text[i: i + seq_length]) #Sequence in
    next_chars.append(raw_text[i + seq_length]) #Sequence out
n_patterns = len(sentences)
print('Number of sequences:', n_patterns)
#Have a look at sentences and next_chars to see the continuity...
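#For example (added illustration): print the first training pair to confirm that
#next_chars[0] is simply the character that follows sentences[0] in the raw text.
print(repr(sentences[0]), '->', repr(next_chars[0]))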
############################
#Just like time series, X is the sequence / sentence and y is the next value
#that comes after the sentence...
# reshape input to be [samples, time steps, features]
#time steps = sequence length
#features = numbers of characters in our vocab (n_vocab)
#Vectorize all sentences: one-hot encode each of the n_patterns sentences.
#Each sentence becomes a (seq_length, n_vocab) boolean array with a single
#True per position, marking which of the n_vocab characters occurs there.
x = np.zeros((len(sentences), seq_length, n_vocab), dtype=bool) #np.bool was removed from recent NumPy; the builtin bool works
y = np.zeros((len(sentences), n_vocab), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_int[char]] = 1
    y[i, char_to_int[next_chars[i]]] = 1
print(x.shape)
print(y.shape)
print(y[0:10])
##################################################
#Basic model with one LSTM
# build the model: a single LSTM
model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, n_vocab)))
model.add(Dense(n_vocab, activation='softmax'))
optimizer = RMSprop(learning_rate=0.01) #the lr argument was renamed to learning_rate in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()
######################################
# Deeper model with 2 LSTM layers
#To stack LSTM layers, we need to change the configuration of the prior
#LSTM layer to output a 3D array as input for the subsequent layer.
#We can do this by setting the return_sequences argument on the layer to True
#(defaults to False). This will return one output for each input time step and provide a 3D array.
#Below is the same example as above with return_sequences=True.
#model = Sequential()
#model.add(LSTM(128, input_shape=(seq_length, n_vocab), return_sequences=True))
#model.add(Dropout(0.2))
#model.add(LSTM(128))
#model.add(Dropout(0.2))
#model.add(Dense(n_vocab, activation='softmax'))
#optimizer = RMSprop(learning_rate=0.01)
#model.compile(loss='categorical_crossentropy', optimizer=optimizer)
#model.summary()
###############
# define the checkpoint
from keras.callbacks import ModelCheckpoint
filepath="saved_weights/saved_weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Fit the model
history = model.fit(x, y,
                    batch_size=128,
                    epochs=50,
                    callbacks=callbacks_list)
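#Aside (assumption, not part of the original workflow): to also monitor validation
#loss, you could hold out a fraction of the data and plot 'val_loss' as well, e.g.:
#history = model.fit(x, y, batch_size=128, epochs=50,
#                    validation_split=0.1, callbacks=callbacks_list)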
model.save('my_saved_weights_jungle_book_50epochs.h5')
##########################################################################
from matplotlib import pyplot as plt
#plot the training loss at each epoch (no validation data or accuracy was tracked)
loss = history.history['loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label='Training loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
####################################################################
###################################
#Generate characters
#We must provide a sequence of seq_length characters as input to start the generation process.
#The prediction result is a probability for each of the n_vocab characters at that
#point in the sequence. Rather than always picking the one with max probability,
#sample() draws the next character from this distribution.
def sample(preds):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds) #exp(log(x)) == x, so these two lines only matter once a temperature divisor is added
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1) #draw one sample from the multinomial distribution
    return np.argmax(probas) #index of the drawn character
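#Hedged variant (assumption; mirrors the standard Keras text-generation example,
#not something the original script defines): a "temperature" parameter rescales the
#log-probabilities before sampling. Temperatures below 1.0 sharpen the distribution
#(safer, more repetitive text); above 1.0 flatten it (more surprising text).
#At temperature=1.0 this reduces to sample() above.
def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature #epsilon guards against log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds) #renormalize into a probability distribution
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)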
#Prediction
# load the network weights
filename = "my_saved_weights_jungle_book_50epochs.h5"
model.load_weights(filename)
#Pick a random sentence from the text as seed.
start_index = random.randint(0, n_chars - seq_length - 1)
#Initialize the generated text and keep appending new predictions, printing them as we go
generated = ''
sentence = raw_text[start_index: start_index + seq_length]
generated += sentence
print('----- Seed for our text prediction: "' + sentence + '"')
#sys.stdout.write(generated)
for i in range(400): # Number of characters to generate, including spaces
    x_pred = np.zeros((1, seq_length, n_vocab))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_to_int[char]] = 1.
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds)
    next_char = int_to_char[next_index]
    generated += next_char
    sentence = sentence[1:] + next_char #slide the window: drop the first character, append the prediction
    sys.stdout.write(next_char)
    sys.stdout.flush()
print()
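#The seed plus all 400 sampled characters were also accumulated in `generated`,
#so the full passage can be printed (or written to a file) in one go:
print(generated)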
############################################