...

vdumoulin · May 8, 2014 · 5f0b707 · 5f0b707
1 parent bd2c881
commit 5f0b707
Show file tree

Hide file tree

Showing 2 changed files with 223 additions and 17 deletions.
diff --git a/code/pylearn2/models/rnn.py b/code/pylearn2/models/rnn.py
@@ -7,7 +7,7 @@
 from theano.compat.python2x import OrderedDict
 from pylearn2.models.model import Model
 from pylearn2.space import CompositeSpace
-from research.code.pylearn2.space import VectorSequenceSpace
+from pylearn2.space import VectorSpace, VectorSequenceSpace
 from research.code.pylearn2.datasets.timit import TIMITSequences
 from pylearn2.utils import sharedX
 from pylearn2.costs.cost import Cost, DefaultDataSpecsMixin
@@ -17,15 +17,16 @@ class ToyRNN(Model):
     """
     WRITEME
     """
-    def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
-                 use_ground_truth=True):
+    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
+                 non_linearity='sigmoid', use_ground_truth=True):
         allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                    'tanh': T.tanh}
         self.nvis = nvis
         self.nhid = nhid
+        self.hidden_transition_model = hidden_transition_model
         self.use_ground_truth = use_ground_truth
         self.alpha = sharedX(1)
-        self.alpha_decrease_rate = 0.9
+        self.alpha_decrease_rate = 1.0#0.99
 
         assert non_linearity in allowed_non_linearities
         self.non_linearity = allowed_non_linearities[non_linearity]
@@ -35,6 +36,7 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
             VectorSequenceSpace(dim=self.nvis),
             VectorSequenceSpace(dim=62)
         ])
+        self.hidden_space = VectorSpace(dim=self.nhid)
         self.output_space = VectorSequenceSpace(dim=1)
         self.input_source = ('features', 'phones')
         self.target_source = 'targets'
@@ -47,10 +49,6 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
         V_value = numpy.random.uniform(low=-irange, high=irange,
                                        size=(62, self.nhid))
         self.V = sharedX(V_value, name='V')
-        # Hidden-to-hidden matrix
-        M_value = numpy.random.uniform(low=-irange, high=irange,
-                                       size=(self.nhid, self.nhid))
-        self.M = sharedX(M_value, name='M')
         # Hidden biases
         b_value = numpy.zeros(self.nhid)
         self.b = sharedX(b_value, name='b')
@@ -63,19 +61,23 @@ def __init__(self, nvis, nhid, irange=0.05, non_linearity='sigmoid',
         self.c = sharedX(c_value, name='c')
 
     def fprop_step(self, features, phones, h_tm1, out):
+        """
+        Compute one recurrent step from ground-truth inputs.
+
+        Parameters
+        ----------
+        features : theano variable
+            Input features for the current step (multiplied by `self.W`)
+        phones : theano variable
+            Phone input for the current step (multiplied by `self.V`)
+        h_tm1 : theano variable
+            Hidden state of the previous step
+        out : theano variable
+            Previous output; it is not read, only rebound to the new output
+
+        Returns
+        -------
+        h, out : tuple of theano variables
+            New hidden state and the output computed from it
+        """
+        # Add a leading broadcastable axis and convert the previous hidden
+        # state to whatever space the transition model expects as input.
+        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                            self.hidden_transition_model.input_space)
         h = self.non_linearity(T.dot(features, self.W) +
                                T.dot(phones, self.V) +
-                               T.dot(h_tm1, self.M) +
+                               self.hidden_transition_model.fprop(h_tm1).flatten() +
                                self.b)
         out = T.dot(h, self.U) + self.c
         return h, out
 
     def fprop_step_prime(self, truth, phones, features, h_tm1, out):
-        T.set_subtensor(features[-1],
-                        (1 - self.alpha) * features[-1] + self.alpha * truth[-1])
+        features = T.set_subtensor(features[-1], (1 - self.alpha) *
+                                   features[-1] + self.alpha * truth[-1])
+        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                            self.hidden_transition_model.input_space)
         h = self.non_linearity(T.dot(features, self.W) +
                                T.dot(phones, self.V) +
-                               T.dot(h_tm1, self.M) +
+                               self.hidden_transition_model.fprop(h_tm1).flatten() +
                                self.b)
         out = T.dot(h, self.U) + self.c
         features = T.concatenate([features[1:], out])
@@ -118,15 +120,143 @@ def fprop(self, data):
             return out
 
     def predict_next(self, features, phones, h_tm1):
+        """
+        Compute the next hidden state and output for a single step.
+
+        Parameters
+        ----------
+        features : theano variable
+            Input features for the current step
+        phones : theano variable
+            Phone input for the current step
+        h_tm1 : theano variable
+            Hidden state of the previous step
+
+        Returns
+        -------
+        h, out : tuple of theano variables
+            New hidden state and the output computed from it
+        """
+        transition = self.hidden_transition_model
+        # The transition model receives the previous state with a leading
+        # broadcastable axis, formatted into its own input space.
+        prev_hidden = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                                  transition.input_space)
+        input_term = T.dot(features, self.W) + T.dot(phones, self.V)
+        h = self.non_linearity(input_term +
+                               transition.fprop(prev_hidden).flatten() +
+                               self.b)
+        return h, T.dot(h, self.U) + self.c
+
+    def get_params(self):
+        """
+        Return this model's own parameters followed by the parameters of
+        the hidden transition model.
+        """
+        own_params = [self.W, self.V, self.b, self.U, self.c]
+        return own_params + self.hidden_transition_model.get_params()
+
+    def get_input_source(self):
+        """
+        Return the data source name(s) stored in `self.input_source`.
+        """
+        return self.input_source
+
+    def get_target_source(self):
+        """
+        Return the data source name stored in `self.target_source`.
+        """
+        return self.target_source
+
+    def censor_updates(self, updates):
+        """
+        Add the decay of `alpha` to the update dictionary.
+
+        Parameters
+        ----------
+        updates : dict
+            Maps shared variables to their new values; modified in place so
+            that `alpha` becomes `alpha_decrease_rate * alpha`
+        """
+        decayed_alpha = self.alpha_decrease_rate * self.alpha
+        updates[self.alpha] = decayed_alpha
+
+    def get_monitoring_channels(self, data):
+        """
+        Return an OrderedDict with a single 'alpha' channel holding the
+        current mixing coefficient. `data` is not used.
+        """
+        return OrderedDict([('alpha', self.alpha)])
+
+
+class ToyRNNPhone(Model):
+    """
+    WRITEME
+    """
+    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
+                 non_linearity='sigmoid', use_ground_truth=True):
+        """
+        Parameters
+        ----------
+        nvis : int
+            Dimension of the input feature vector
+        nhid : int
+            Number of hidden units
+        hidden_transition_model : object
+            Model stored as `self.hidden_transition_model`; the step methods
+            use its `input_space`, `fprop` and `get_params`
+        irange : float
+            `W` and `U` are drawn uniformly from [-irange, irange]
+        non_linearity : str
+            Either 'sigmoid' or 'tanh'
+        use_ground_truth : bool
+            Selects which branch `fprop` builds
+        """
+        self.nvis, self.nhid = nvis, nhid
+        self.hidden_transition_model = hidden_transition_model
+        self.use_ground_truth = use_ground_truth
+        # Coefficient used by fprop_step_prime to blend ground truth into
+        # the model's input; it starts at 1.
+        self.alpha = sharedX(1)
+        self.alpha_decrease_rate = 0.999
+
+        activations = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh}
+        assert non_linearity in activations
+        self.non_linearity = activations[non_linearity]
+
+        # Spaces and data sources
+        self.input_space = VectorSpace(dim=nvis)
+        self.hidden_space = VectorSpace(dim=nhid)
+        self.output_space = VectorSpace(dim=1)
+        self.input_source = 'features'
+        self.target_source = 'targets'
+
+        def uniform(*shape):
+            # Draw a matrix of the given shape from U(-irange, irange)
+            return numpy.random.uniform(low=-irange, high=irange, size=shape)
+
+        # Features-to-hidden weights and hidden biases
+        self.W = sharedX(uniform(nvis, nhid), name='W')
+        self.b = sharedX(numpy.zeros(nhid), name='b')
+        # Hidden-to-output weights and output bias
+        self.U = sharedX(uniform(nhid, 1), name='U')
+        self.c = sharedX(numpy.zeros(1), name='c')
+
+    def fprop_step(self, features, h_tm1, out):
+        """
+        Compute one recurrent step from ground-truth features.
+
+        The hidden activation is the one selected by the constructor's
+        `non_linearity` argument (`self.non_linearity`), not a hard-coded
+        sigmoid, so a model built with 'tanh' actually uses tanh.
+
+        Parameters
+        ----------
+        features : theano variable
+            Input features for the current step
+        h_tm1 : theano variable
+            Hidden state of the previous step
+        out : theano variable
+            Previous output; it is not read, only rebound to the new output
+
+        Returns
+        -------
+        h, out : tuple of theano variables
+            New hidden state and the output computed from it
+        """
+        # Add a leading broadcastable axis and convert the previous hidden
+        # state to the transition model's input space.
+        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                            self.hidden_transition_model.input_space)
+        h = self.non_linearity(T.dot(features, self.W) +
+                               self.hidden_transition_model.fprop(h_tm1).flatten() +
+                               self.b)
+        out = T.dot(h, self.U) + self.c
+        return h, out
+
+    def fprop_step_prime(self, truth, features, h_tm1, out):
+        """
+        Compute one recurrent step in which the model is fed its own
+        previous outputs, partially corrected by the ground truth.
+
+        The hidden activation is the one selected by the constructor's
+        `non_linearity` argument (`self.non_linearity`), not a hard-coded
+        sigmoid, so a model built with 'tanh' actually uses tanh.
+
+        Parameters
+        ----------
+        truth : theano variable
+            Ground-truth feature window for the current step
+        features : theano variable
+            Feature window carried over from the previous step
+        h_tm1 : theano variable
+            Hidden state of the previous step
+        out : theano variable
+            Previous output; it is not read, only rebound to the new output
+
+        Returns
+        -------
+        features, h, out : tuple of theano variables
+            Window to feed to the next step, new hidden state, new output
+        """
+        # Blend the last element of the carried-over window with the ground
+        # truth: alpha = 1 uses the truth only, alpha = 0 the model's value.
+        features = T.set_subtensor(features[-1], (1 - self.alpha) *
+                                   features[-1] + self.alpha * truth[-1])
+        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                            self.hidden_transition_model.input_space)
+        h = self.non_linearity(T.dot(features, self.W) +
+                               self.hidden_transition_model.fprop(h_tm1).flatten() +
+                               self.b)
+        out = T.dot(h, self.U) + self.c
+        # Slide the window: drop the oldest element, append the new output.
+        features = T.concatenate([features[1:], out])
+        return features, h, out
+
+    def fprop(self, data):
+        """
+        Build the symbolic sequence of outputs for an input sequence.
+
+        Parameters
+        ----------
+        data : theano variable
+            Input validated against `self.input_space` and scanned over
+            its first axis
+
+        Returns
+        -------
+        out : theano variable
+            Outputs of every scan step. With `use_ground_truth` set, each
+            step goes through `fprop_step`; otherwise it goes through
+            `fprop_step_prime`, starting from the first row of `data`.
+        """
+        self.input_space.validate(data)
+        features = data
+
+        # Both branches start from a zero hidden state and a zero output.
+        zero = numpy.cast[theano.config.floatX](0)
+        init_h = T.alloc(zero, self.nhid)
+        init_out = T.unbroadcast(T.alloc(zero, 1), 0)
+        hidden_info = dict(initial=init_h, taps=[-1])
+
+        if self.use_ground_truth:
+            (_, out), _ = theano.scan(fn=self.fprop_step,
+                                      sequences=[features],
+                                      outputs_info=[hidden_info, init_out])
+        else:
+            (_, _, out), _ = theano.scan(fn=self.fprop_step_prime,
+                                         sequences=[features],
+                                         outputs_info=[features[0],
+                                                       hidden_info,
+                                                       init_out])
+        return out
+
+    def predict_next(self, features, h_tm1):
+        """
+        Compute the next hidden state and output for a single step.
+
+        The hidden activation is the one selected by the constructor's
+        `non_linearity` argument (`self.non_linearity`), not a hard-coded
+        sigmoid, so a model built with 'tanh' actually uses tanh.
+
+        Parameters
+        ----------
+        features : theano variable
+            Input features for the current step
+        h_tm1 : theano variable
+            Hidden state of the previous step
+
+        Returns
+        -------
+        h, out : tuple of theano variables
+            New hidden state and the output computed from it
+        """
+        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
+                                            self.hidden_transition_model.input_space)
-        h = T.nnet.sigmoid(T.dot(features, self.W) +
-                           T.dot(phones, self.V) +
-                           T.dot(h_tm1, self.M) +
-                           self.b)
+        h = self.non_linearity(T.dot(features, self.W) +
+                               self.hidden_transition_model.fprop(h_tm1).flatten() +
+                               self.b)
         out = T.dot(h, self.U) + self.c
         return h, out
 
     def get_params(self):
+        """
+        Return this model's own parameters followed by the parameters of
+        the hidden transition model.
+        """
-        return [self.W, self.V, self.M, self.b, self.U, self.c]
+        return [self.W, self.b, self.U, self.c] + \
+               self.hidden_transition_model.get_params()
 
     def get_input_source(self):
+        """
+        Return the data source name(s) stored in `self.input_source`.
+        """
         return self.input_source

diff --git a/code/pylearn2/scripts/rnn/reconstruct.py b/code/pylearn2/scripts/rnn/reconstruct.py
@@ -11,7 +11,7 @@
 from pylearn2.utils import serial
 from pylearn2.config import yaml_parse
 from pylearn2.space import CompositeSpace
-from research.code.pylearn2.space import VectorSequenceSpace
+from pylearn2.space import VectorSequenceSpace, VectorSpace
 import theano
 from theano.sandbox.rng_mrg import MRG_RandomStreams
 import scipy.io.wavfile as wf
@@ -81,6 +81,82 @@ def main(model_path):
     wf.write("original.wav", 16000, original)
 
 
+def main_phone(model_path, sample=False):
+    """
+    Generate audio from a trained model exposing
+    `fprop_step(features, h_tm1, out)` and save it next to the original.
+
+    Files written to the current directory, all at a 16000 Hz rate:
+
+    * reconstruction.wav: the model is seeded with the first row of the
+      dataset and then fed its own outputs
+    * prediction.wav: one-step predictions from the ground-truth rows
+    * original.wav: the original signal rebuilt from the dataset rows
+    * phone_audio.png: the three signals plotted one above the other
+
+    Parameters
+    ----------
+    model_path : str
+        Path of the pickled model; the model must carry the YAML
+        description of its dataset in `dataset_yaml_src`
+    sample : bool, optional
+        If True, every value fed back during reconstruction is drawn from
+        a normal distribution centred on the model output, whose standard
+        deviation is the square root of the last recorded
+        'train_objective' monitor value. Defaults to False, which feeds
+        back the model output itself.
+    """
+    print('Loading model...')
+    model = serial.load(model_path)
+
+    dataset = yaml_parse.load(model.dataset_yaml_src)
+    data_specs = (VectorSpace(dim=model.nvis), 'features')
+    # NOTE(review): this iterator is never consumed below; the call is
+    # kept in case building it matters to the dataset. Confirm and drop.
+    it = dataset.iterator(mode='sequential', data_specs=data_specs,
+                          num_batches=1, batch_size=1)
+    original_sequence = dataset.X
+    floatX = theano.config.floatX
+
+    def _to_int16(signal):
+        # Rescale with the dataset's _std, _mean and _mean_norm statistics
+        # and cast to the 16-bit integers written to the WAV files.
+        signal = (signal * dataset._std + dataset._mean) * dataset._mean_norm
+        return numpy.cast['int16'](signal)
+
+    X = T.vector('X')
+    h = T.vector('h')
+    out = T.vector('out')
+    next_h, pred = model.fprop_step(X, h, out)
+    # allow_input_downcast lets float64 numpy rows be passed when floatX is
+    # float32; without it Theano rejects such inputs.
+    fn = theano.function(inputs=[X, h, out], outputs=[next_h, pred],
+                         on_unused_input='ignore',
+                         allow_input_downcast=True)
+
+    # Reconstruction: the model is fed its own outputs
+    if sample:
+        std = numpy.sqrt(
+            model.monitor.channels['train_objective'].val_record[-1])
+    numpy_h = numpy.zeros(model.nhid, dtype=floatX)
+    numpy_out = numpy.zeros(1, dtype=floatX)
+    x_t = numpy.copy(original_sequence[0])
+
+    reconstruction_list = [original_sequence[0]]
+    for __ in original_sequence:
+        numpy_h, numpy_out = fn(x_t, numpy_h, numpy_out)
+        if sample:
+            next_value = numpy.random.normal(loc=numpy_out, scale=std,
+                                             size=(1,))
+        else:
+            next_value = numpy_out
+        reconstruction_list.append(next_value)
+        # Slide the input window by one and append the newest value
+        x_t[:-1] = x_t[1:]
+        x_t[-1] = next_value
+
+    numpy_reconstruction = _to_int16(numpy.concatenate(reconstruction_list))
+    wf.write("reconstruction.wav", 16000, numpy_reconstruction)
+
+    # One-on-one prediction: every step sees the ground-truth row
+    numpy_h = numpy.zeros(model.nhid, dtype=floatX)
+    numpy_out = numpy.zeros(1, dtype=floatX)
+
+    prediction_list = [numpy.copy(original_sequence[0])]
+    for x_t in original_sequence:
+        numpy_h, numpy_out = fn(x_t, numpy_h, numpy_out)
+        prediction_list.append(numpy_out)
+
+    numpy_prediction = _to_int16(numpy.concatenate(prediction_list))
+    wf.write("prediction.wav", 16000, numpy_prediction)
+
+    # Original signal: the whole first row, then the last element of every
+    # following row
+    original = _to_int16(numpy.concatenate([original_sequence[0],
+                                            original_sequence[1:, -1]]))
+    wf.write("original.wav", 16000, original)
+
+    from matplotlib import pyplot
+    pyplot.figure()
+    curves = [(original, 'r'),
+              (numpy_prediction, 'b'),
+              (numpy_reconstruction, 'g')]
+    for row, (signal, style) in enumerate(curves, 1):
+        pyplot.subplot(3, 1, row)
+        pyplot.plot(signal, style)
+    pyplot.savefig('phone_audio.png')
+
+
 if __name__ == '__main__':
     # Argument parsing
     parser = argparse.ArgumentParser()
@@ -89,4 +165,4 @@ def main(model_path):
 
     model_path = args.model_path
 
-    main(model_path)
+    main_phone(model_path)