Skip to content

Commit

Permalink
...
Browse files Browse the repository at this point in the history
  • Loading branch information
vdumoulin committed May 8, 2014
1 parent bd2c881 commit 5f0b707
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 17 deletions.
160 changes: 145 additions & 15 deletions code/pylearn2/models/rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from theano.compat.python2x import OrderedDict
from pylearn2.models.model import Model
from pylearn2.space import CompositeSpace
from research.code.pylearn2.space import VectorSequenceSpace
from pylearn2.space import VectorSpace, VectorSequenceSpace
from research.code.pylearn2.datasets.timit import TIMITSequences
from pylearn2.utils import sharedX
from pylearn2.costs.cost import Cost, DefaultDataSpecsMixin
Expand All @@ -17,15 +17,16 @@ class ToyRNN(Model):
"""
WRITEME
"""
def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
use_ground_truth=True):
def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
non_linearity='sigmoid', use_ground_truth=True):
allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
'tanh': T.tanh}
self.nvis = nvis
self.nhid = nhid
self.hidden_transition_model = hidden_transition_model
self.use_ground_truth = use_ground_truth
self.alpha = sharedX(1)
self.alpha_decrease_rate = 0.9
self.alpha_decrease_rate = 1.0#0.99

assert non_linearity in allowed_non_linearities
self.non_linearity = allowed_non_linearities[non_linearity]
Expand All @@ -35,6 +36,7 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
VectorSequenceSpace(dim=self.nvis),
VectorSequenceSpace(dim=62)
])
self.hidden_space = VectorSpace(dim=self.nhid)
self.output_space = VectorSequenceSpace(dim=1)
self.input_source = ('features', 'phones')
self.target_source = 'targets'
Expand All @@ -47,10 +49,6 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
V_value = numpy.random.uniform(low=-irange, high=irange,
size=(62, self.nhid))
self.V = sharedX(V_value, name='V')
# Hidden-to-hidden matrix
M_value = numpy.random.uniform(low=-irange, high=irange,
size=(self.nhid, self.nhid))
self.M = sharedX(M_value, name='M')
# Hidden biases
b_value = numpy.zeros(self.nhid)
self.b = sharedX(b_value, name='b')
Expand All @@ -63,19 +61,23 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
self.c = sharedX(c_value, name='c')

def fprop_step(self, features, phones, h_tm1, out):
    """
    Compute the hidden state and output for a single time step.

    Parameters
    ----------
    features : tensor_like
        Input features for the current time step.
    phones : tensor_like
        Phone input for the current time step.
    h_tm1 : tensor_like
        Hidden state of the previous time step (1-D; a leading axis is
        added before it is handed to the hidden transition model).
    out : tensor_like
        Not read by this method; it is overwritten before being used.

    Returns
    -------
    h : tensor_like
        New hidden state.
    out : tensor_like
        Output computed from the new hidden state.
    """
    # Add a leading axis and convert the previous hidden state to the
    # format expected by the hidden transition model.
    h_tm1 = self.hidden_space.format_as(
        h_tm1.dimshuffle('x', 0),
        self.hidden_transition_model.input_space
    )
    # The recurrent contribution comes solely from the transition model,
    # exactly as in `predict_next`; the former hidden-to-hidden matrix
    # term is gone (it is not part of `get_params` either).
    recurrent = self.hidden_transition_model.fprop(h_tm1).flatten()
    h = self.non_linearity(T.dot(features, self.W) +
                           T.dot(phones, self.V) +
                           recurrent +
                           self.b)
    out = T.dot(h, self.U) + self.c
    return h, out

def fprop_step_prime(self, truth, phones, features, h_tm1, out):
T.set_subtensor(features[-1],
(1 - self.alpha) * features[-1] + self.alpha * truth[-1])
features = T.set_subtensor(features[-1], (1 - self.alpha) *
features[-1] + self.alpha * truth[-1])
h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
self.hidden_transition_model.input_space)
h = self.non_linearity(T.dot(features, self.W) +
T.dot(phones, self.V) +
T.dot(h_tm1, self.M) +
self.hidden_transition_model.fprop(h_tm1).flatten() +
self.b)
out = T.dot(h, self.U) + self.c
features = T.concatenate([features[1:], out])
Expand Down Expand Up @@ -118,15 +120,143 @@ def fprop(self, data):
return out

def predict_next(self, features, phones, h_tm1):
    """
    Predict the next output given the current inputs and hidden state.

    Parameters
    ----------
    features : tensor_like
        Input features for the current time step.
    phones : tensor_like
        Phone input for the current time step.
    h_tm1 : tensor_like
        Hidden state of the previous time step (1-D).

    Returns
    -------
    h : tensor_like
        New hidden state.
    out : tensor_like
        Output computed from the new hidden state.
    """
    transition = self.hidden_transition_model
    # The transition model receives the hidden state with a leading
    # axis, converted to its own input space.
    h_row = h_tm1.dimshuffle('x', 0)
    h_formatted = self.hidden_space.format_as(h_row,
                                              transition.input_space)
    pre_activation = (T.dot(features, self.W) +
                      T.dot(phones, self.V) +
                      transition.fprop(h_formatted).flatten() +
                      self.b)
    h = self.non_linearity(pre_activation)
    return h, T.dot(h, self.U) + self.c

def get_params(self):
    """
    Return the model parameters.

    Returns
    -------
    params : list
        This model's own parameters (W, V, b, U, c) followed by the
        parameters of the hidden transition model.
    """
    own_params = [self.W, self.V, self.b, self.U, self.c]
    transition_params = self.hidden_transition_model.get_params()
    return own_params + transition_params

def get_input_source(self):
    """
    Return the source name(s) from which the model inputs are read.

    Returns
    -------
    input_source : object
        The value stored in `self.input_source`.
    """
    return self.input_source

def get_target_source(self):
    """
    Return the source name from which the model targets are read.

    Returns
    -------
    target_source : object
        The value stored in `self.target_source`.
    """
    return self.target_source

def censor_updates(self, updates):
    """
    Register the decay of `alpha` in the given updates dictionary.

    Parameters
    ----------
    updates : dict
        Mapping from variables to their new values. Modified in place:
        the entry for `self.alpha` is set to `alpha` multiplied by
        `self.alpha_decrease_rate`.
    """
    decayed_alpha = self.alpha_decrease_rate * self.alpha
    updates[self.alpha] = decayed_alpha

def get_monitoring_channels(self, data):
    """
    Return the channels to monitor for this model.

    Parameters
    ----------
    data : object
        Not used by this method.

    Returns
    -------
    rval : OrderedDict
        A single channel, 'alpha', mapped to `self.alpha`.
    """
    return OrderedDict([('alpha', self.alpha)])


class ToyRNNPhone(Model):
"""
WRITEME
"""
def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
             non_linearity='sigmoid', use_ground_truth=True):
    """
    Set up the spaces, sources and parameters of the model.

    Parameters
    ----------
    nvis : int
        Dimension of the input vector space.
    nhid : int
        Dimension of the hidden vector space.
    hidden_transition_model : object
        Model applied to the previous hidden state at every step; the
        other methods use its `input_space`, `fprop` and `get_params`.
    irange : float, optional
        W and U are initialized uniformly in [-irange, irange].
    non_linearity : str, optional
        Either 'sigmoid' or 'tanh'.
    use_ground_truth : bool, optional
        Selects which of the two branches of `fprop` is used.
    """
    non_linearity_by_name = {'sigmoid': T.nnet.sigmoid,
                             'tanh': T.tanh}
    self.nvis = nvis
    self.nhid = nhid
    self.hidden_transition_model = hidden_transition_model
    self.use_ground_truth = use_ground_truth
    # Weight given to the ground truth in `fprop_step_prime`; it is
    # multiplied by `alpha_decrease_rate` through the updates.
    self.alpha = sharedX(1)
    self.alpha_decrease_rate = 0.999

    assert non_linearity in non_linearity_by_name
    self.non_linearity = non_linearity_by_name[non_linearity]

    # Spaces and data sources
    self.input_space = VectorSpace(dim=self.nvis)
    self.hidden_space = VectorSpace(dim=self.nhid)
    self.output_space = VectorSpace(dim=1)
    self.input_source = 'features'
    self.target_source = 'targets'

    def uniform_init(shape):
        return numpy.random.uniform(low=-irange, high=irange, size=shape)

    # Parameters, created in the same order as the random draws they
    # rely on: features-to-hidden matrix, hidden biases,
    # hidden-to-output matrix, output bias.
    self.W = sharedX(uniform_init((self.nvis, self.nhid)), name='W')
    self.b = sharedX(numpy.zeros(self.nhid), name='b')
    self.U = sharedX(uniform_init((self.nhid, 1)), name='U')
    self.c = sharedX(numpy.zeros(1), name='c')

def fprop_step(self, features, h_tm1, out):
    """
    Compute the hidden state and output for a single time step.

    Parameters
    ----------
    features : tensor_like
        Input features for the current time step.
    h_tm1 : tensor_like
        Hidden state of the previous time step (1-D).
    out : tensor_like
        Not read by this method; it is overwritten before being used.

    Returns
    -------
    h : tensor_like
        New hidden state.
    out : tensor_like
        Output computed from the new hidden state.
    """
    transition = self.hidden_transition_model
    # The transition model receives the hidden state with a leading
    # axis, converted to its own input space.
    h_row = h_tm1.dimshuffle('x', 0)
    h_formatted = self.hidden_space.format_as(h_row,
                                              transition.input_space)
    # NOTE(review): the sigmoid is hard-coded here, so the
    # `non_linearity` chosen in the constructor is ignored -- confirm
    # whether that is intended.
    pre_activation = (T.dot(features, self.W) +
                      transition.fprop(h_formatted).flatten() +
                      self.b)
    h = T.nnet.sigmoid(pre_activation)
    return h, T.dot(h, self.U) + self.c

def fprop_step_prime(self, truth, features, h_tm1, out):
    """
    Compute one time step while feeding the model its own predictions.

    Parameters
    ----------
    truth : tensor_like
        Ground-truth input window for the current time step.
    features : tensor_like
        Input window carried over from the previous step.
    h_tm1 : tensor_like
        Hidden state of the previous time step (1-D).
    out : tensor_like
        Not read by this method; it is overwritten before being used.

    Returns
    -------
    features : tensor_like
        Input window for the next step: the current one without its
        first element, with the new output appended.
    h : tensor_like
        New hidden state.
    out : tensor_like
        Output computed from the new hidden state.
    """
    # Blend the last element of the carried-over window with the ground
    # truth; `alpha` is the weight of the ground truth.
    blended_last = ((1 - self.alpha) * features[-1] +
                    self.alpha * truth[-1])
    features = T.set_subtensor(features[-1], blended_last)

    transition = self.hidden_transition_model
    h_row = h_tm1.dimshuffle('x', 0)
    h_formatted = self.hidden_space.format_as(h_row,
                                              transition.input_space)
    # NOTE(review): the sigmoid is hard-coded here, so the
    # `non_linearity` chosen in the constructor is ignored -- confirm
    # whether that is intended.
    pre_activation = (T.dot(features, self.W) +
                      transition.fprop(h_formatted).flatten() +
                      self.b)
    h = T.nnet.sigmoid(pre_activation)
    out = T.dot(h, self.U) + self.c

    # Slide the window: drop the oldest element, append the prediction.
    next_features = T.concatenate([features[1:], out])
    return next_features, h, out

def fprop(self, data):
    """
    Run the model over a whole sequence with `theano.scan`.

    Parameters
    ----------
    data : tensor_like
        Sequence of input windows; validated against `input_space`.

    Returns
    -------
    out : tensor_like
        Outputs produced at every step of the scan.

    Notes
    -----
    When `use_ground_truth` is true every step reads its input from
    `data` (`fprop_step`). Otherwise the input window is carried from
    step to step, starting at `data[0]`, and mixed with the ground truth
    by `fprop_step_prime`.
    """
    self.input_space.validate(data)
    features = data

    zero = numpy.cast[theano.config.floatX](0)
    init_h = T.alloc(zero, self.nhid)
    # Presumably needed so the initial output has the same type as the
    # step function's output -- confirm.
    init_out = T.unbroadcast(T.alloc(zero, 1), 0)
    hidden_info = dict(initial=init_h, taps=[-1])

    if self.use_ground_truth:
        def step(f, h, o):
            return self.fprop_step(f, h, o)

        outputs_info = [hidden_info, init_out]
    else:
        def step(t, f, h, o):
            return self.fprop_step_prime(t, f, h, o)

        # The carried input window is the first recurrent output.
        outputs_info = [features[0], hidden_info, init_out]

    outputs, _ = theano.scan(fn=step,
                             sequences=[features],
                             outputs_info=outputs_info)
    # In both cases the model output is the last of the scan outputs.
    return outputs[-1]

def predict_next(self, features, h_tm1):
    """
    Predict the next output given the current input and hidden state.

    Parameters
    ----------
    features : tensor_like
        Input features for the current time step.
    h_tm1 : tensor_like
        Hidden state of the previous time step (1-D).

    Returns
    -------
    h : tensor_like
        New hidden state.
    out : tensor_like
        Output computed from the new hidden state.
    """
    # Add a leading axis and convert the previous hidden state to the
    # format expected by the hidden transition model.
    h_tm1 = self.hidden_space.format_as(
        h_tm1.dimshuffle('x', 0),
        self.hidden_transition_model.input_space
    )
    # This model has no phone input and no hidden-to-hidden matrix: the
    # pre-activation is made of the feature term, the transition model's
    # output and the bias, as in `fprop_step`.
    recurrent = self.hidden_transition_model.fprop(h_tm1).flatten()
    # NOTE(review): the sigmoid is hard-coded (as in `fprop_step`), so
    # the `non_linearity` chosen in the constructor is ignored --
    # confirm whether that is intended.
    h = T.nnet.sigmoid(T.dot(features, self.W) +
                       recurrent +
                       self.b)
    out = T.dot(h, self.U) + self.c
    return h, out

def get_params(self):
    """
    Return the model parameters.

    Returns
    -------
    params : list
        This model's own parameters (W, b, U, c) followed by the
        parameters of the hidden transition model.
    """
    # Only the parameters created in the constructor are listed; this
    # model has neither a phone matrix nor a hidden-to-hidden matrix.
    return [self.W, self.b, self.U, self.c] + \
        self.hidden_transition_model.get_params()

def get_input_source(self):
    """
    Return the source name from which the model inputs are read.

    Returns
    -------
    input_source : object
        The value stored in `self.input_source`.
    """
    return self.input_source
Expand Down
80 changes: 78 additions & 2 deletions code/pylearn2/scripts/rnn/reconstruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pylearn2.utils import serial
from pylearn2.config import yaml_parse
from pylearn2.space import CompositeSpace
from research.code.pylearn2.space import VectorSequenceSpace
from pylearn2.space import VectorSequenceSpace, VectorSpace
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams
import scipy.io.wavfile as wf
Expand Down Expand Up @@ -81,6 +81,82 @@ def main(model_path):
wf.write("original.wav", 16000, original)


def _denormalize_to_int16(signal, dataset):
    """Undo the dataset's normalization and cast the signal to int16."""
    rescaled = (signal * dataset._std + dataset._mean) * dataset._mean_norm
    return numpy.asarray(rescaled).astype('int16')


def main_phone(model_path, sample=False):
    """
    Generate audio from a trained phone model and compare it to the data.

    Writes three 16 kHz wav files in the current directory
    ('reconstruction.wav', 'prediction.wav', 'original.wav') and a figure
    with the three waveforms ('phone_audio.png').

    Parameters
    ----------
    model_path : str
        Path of the serialized model. Its `dataset_yaml_src` is used to
        load the dataset.
    sample : bool, optional
        If False (default), the model output is fed back as is during
        reconstruction. If True, the fed-back value is drawn from a
        normal distribution centered on the model output, whose scale is
        the square root of the last recorded 'train_objective' value.
    """
    print('Loading model...')
    model = serial.load(model_path)
    dataset = yaml_parse.load(model.dataset_yaml_src)
    original_sequence = dataset.X

    # Compile a single step of the model.
    X = T.vector('X')
    h = T.vector('h')
    out = T.vector('out')
    next_h, pred = model.fprop_step(X, h, out)
    fn = theano.function(inputs=[X, h, out], outputs=[next_h, pred],
                         on_unused_input='ignore')

    if sample:
        last_objective = \
            model.monitor.channels['train_objective'].val_record[-1]
        std = numpy.sqrt(last_objective)

    # Reconstruction: start from the first window of the data and feed
    # the model its own outputs.
    numpy_h = numpy.zeros(model.nhid)
    numpy_out = numpy.zeros(1)
    x_t = numpy.copy(original_sequence[0])

    reconstruction_list = [original_sequence[0]]
    for __ in original_sequence:
        numpy_h, numpy_out = fn(x_t, numpy_h, numpy_out)
        if sample:
            next_value = numpy.random.normal(loc=numpy_out, scale=std,
                                             size=(1,))
        else:
            next_value = numpy_out
        reconstruction_list.append(next_value)
        # Slide the window by one and append the new value.
        x_t[:-1] = x_t[1:]
        x_t[-1] = next_value

    numpy_reconstruction = _denormalize_to_int16(
        numpy.concatenate(reconstruction_list), dataset
    )
    wf.write("reconstruction.wav", 16000, numpy_reconstruction)

    # One-on-one prediction: every step sees the true input window.
    numpy_h = numpy.zeros(model.nhid)
    numpy_out = numpy.zeros(1)

    prediction_list = [numpy.copy(original_sequence[0])]
    for x_t in original_sequence:
        numpy_h, numpy_out = fn(x_t, numpy_h, numpy_out)
        prediction_list.append(numpy_out)

    numpy_prediction = _denormalize_to_int16(
        numpy.concatenate(prediction_list), dataset
    )
    wf.write("prediction.wav", 16000, numpy_prediction)

    # Original signal: the first window followed by the last element of
    # every later window.
    original = _denormalize_to_int16(
        numpy.concatenate([original_sequence[0],
                           original_sequence[1:, -1]]),
        dataset
    )
    wf.write("original.wav", 16000, original)

    from matplotlib import pyplot
    pyplot.figure()
    waveforms = [(original, 'r'),
                 (numpy_prediction, 'b'),
                 (numpy_reconstruction, 'g')]
    for position, (signal, fmt) in enumerate(waveforms, 1):
        pyplot.subplot(3, 1, position)
        pyplot.plot(signal, fmt)
    pyplot.savefig('phone_audio.png')


if __name__ == '__main__':
# Argument parsing
parser = argparse.ArgumentParser()
Expand All @@ -89,4 +165,4 @@ def main(model_path):

model_path = args.model_path

main(model_path)
main_phone(model_path)

0 comments on commit 5f0b707

Please sign in to comment.