From d6345d85695e4adb66cadf29aae5a4e53efca245 Mon Sep 17 00:00:00 2001
From: Vincent Dumoulin
Date: Thu, 6 Nov 2014 14:47:40 -0500
Subject: [PATCH] Add NADE model

---
 code/pylearn2/costs/__init__.py                    |   0
 code/pylearn2/costs/nade.py                        |  37 ++
 .../models/directed_probabilistic/__init__.py      | 299 +++++++++++
 .../models/directed_probabilistic/nade.py          | 475 ++++++++++++++++++
 code/pylearn2/scripts/nade.yaml                    |  46 ++
 code/pylearn2/utils/unrolled_scan.py               | 131 +++++
 6 files changed, 988 insertions(+)
 create mode 100644 code/pylearn2/costs/__init__.py
 create mode 100644 code/pylearn2/costs/nade.py
 create mode 100644 code/pylearn2/models/directed_probabilistic/__init__.py
 create mode 100644 code/pylearn2/models/directed_probabilistic/nade.py
 create mode 100644 code/pylearn2/scripts/nade.yaml
 create mode 100755 code/pylearn2/utils/unrolled_scan.py

diff --git a/code/pylearn2/costs/__init__.py b/code/pylearn2/costs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/code/pylearn2/costs/nade.py b/code/pylearn2/costs/nade.py
new file mode 100644
index 0000000..db82863
--- /dev/null
+++ b/code/pylearn2/costs/nade.py
@@ -0,0 +1,37 @@
+"""
+Neural autoregressive density estimator (NADE)-related costs
+"""
+__authors__ = "Vincent Dumoulin"
+__copyright__ = "Copyright 2014, Universite de Montreal"
+__credits__ = ["Vincent Dumoulin"]
+__license__ = "3-clause BSD"
+__maintainer__ = "Vincent Dumoulin"
+
+
+import theano.tensor as T
+from pylearn2.costs.cost import Cost, DefaultDataSpecsMixin
+from pylearn2.utils import wraps
+
+
+class NADECost(DefaultDataSpecsMixin, Cost):
+    """
+    NADE negative log-likelihood
+    """
+    @wraps(Cost.expr)
+    def expr(self, model, data, ** kwargs):
+        self.get_data_specs(model)[0].validate(data)
+        X = data
+        return -T.mean(model.log_likelihood(X))
+
+
+class CNADECost(DefaultDataSpecsMixin, Cost):
+    """
+    CNADE negative log-likelihood
+    """
+    supervised = True
+
+    @wraps(Cost.expr)
+    def expr(self, model, data, ** kwargs):
+        self.get_data_specs(model)[0].validate(data)
+        X, Y = data
+        return -T.mean(model.log_likelihood(X, Y))
diff --git a/code/pylearn2/models/directed_probabilistic/__init__.py b/code/pylearn2/models/directed_probabilistic/__init__.py
new file mode 100644
index 0000000..fdffcc8
--- /dev/null
+++ b/code/pylearn2/models/directed_probabilistic/__init__.py
@@ -0,0 +1,299 @@
+"""
+Directed probabilistic models
+"""
+__authors__ = "Vincent Dumoulin"
+__copyright__ = "Copyright 2014, Universite de Montreal"
+__credits__ = ["Vincent Dumoulin"]
+__license__ = "3-clause BSD"
+__maintainer__ = "Vincent Dumoulin"
+
+
+import numpy
+import theano.tensor as T
+from theano.compat import OrderedDict
+from theano.tensor.shared_randomstreams import RandomStreams
+from pylearn2.models.model import Model
+from pylearn2.utils import sharedX
+from pylearn2.utils import wraps
+from pylearn2.space import VectorSpace, NullSpace
+
+
+theano_rng = RandomStreams(seed=23541)
+
+
+class Distribution(Model):
+    """
+    Abstract base class for directed probabilistic models
+    """
+    def _initialize_weights(self, dim_0, dim_1):
+        """
+        Initialize a (dim_0, dim_1)-shaped weight matrix drawn uniformly
+        from (-1 / dim_0, 1 / dim_0)
+
+        Parameters
+        ----------
+        dim_0 : int
+            First dimension of the weights matrix
+        dim_1 : int
+            Second dimension of the weights matrix
+
+        Returns
+        -------
+        rval : `numpy.ndarray`
+            A (dim_0, dim_1)-shaped, properly initialized weights matrix
+        """
+        rval = (2 * numpy.random.uniform(size=(dim_0, dim_1)) - 1) / dim_0
+        return rval
+
+    def get_layer_monitoring_channels(self):
+        """
+        Returns monitoring channels for the min, max and mean of every
+        model parameter
+        """
+        rval = OrderedDict()
+
+        for param in self.get_params():
+            rval[param.name + "_min"] = param.min()
"_min"] = param.min() + rval[param.name + "_max"] = param.max() + rval[param.name + "_mean"] = param.mean() + + return rval + + +class JointDistribution(Distribution): + def _sample(self, num_samples): + raise NotImplementedError() + + def sample(self, num_samples, return_log_likelihood=False, + return_probabilities=False): + """ + Samples from the modeled joint distribution p(x) + + Parameters + ---------- + num_samples : int + Number of samples to draw + return_log_likelihood : bool, optional + If `True`, returns the log-likelihood of the samples in addition to + the samples themselves. Defaults to `False`. + return_probabilities : bool, optional + If `True`, returns the probabilities from which samples were drawn + in addition to the samples themselves. Defaults to `False`. + + Returns + ------- + samples : tensor-like + Batch of `num_samples` samples from p(x) + log_likelihood : tensor-like, optional + Log-likelihood of the drawn samples according to p(x). Returned + only if `return_log_likelihood` is set to `True`. + probabilities : tensor-like, optional + Probabilities from which the samples were drawn. Returned only if + `return_probabilities` is set to `True`. + """ + rval = self._sample(num_samples=num_samples) + samples, log_likelihood, probabilities = rval + + if not return_log_likelihood and not return_probabilities: + return samples + else: + rval = [samples] + if return_log_likelihood: + rval.append(log_likelihood) + if return_probabilities: + rval.append(probabilities) + return tuple(rval) + + def _log_likelihood(self, X): + raise NotImplementedError() + + def log_likelihood(self, X): + """ + Computes the log-likelihood of a batch of observed examples on a + per-example basis + + Parameters + ---------- + X : tensor-like + Batch of observed examples + + Returns + ------- + rval : tensor-like + Log-likelihood for the batch of visible examples, with shape + (X.shape[0],) + """ + return self._log_likelihood(X=X) + + +class ConditionalDistribution(Distribution): + def _sample(self, num_samples): + raise NotImplementedError() + + def sample(self, Y, return_log_likelihood=False, + return_probabilities=False): + """ + Samples from the conditional distribution p(x | y) + + Parameters + ---------- + return_log_likelihood : bool, optional + If `True`, returns the conditional log-likelihood of the samples in + addition to the samples themselves. Defaults to `False`. + return_probabilities : bool, optional + If `True`, returns the conditional probabilities from which samples + were drawn in addition to the samples themselves. Defaults to + `False`. + + Returns + ------- + samples : tensor-like + Batch of `num_samples` samples from p(x) + log_likelihood : tensor-like, optional + Log-likelihood of the drawn samples according to p(x | y). Returned + only if `return_log_likelihood` is set to `True`. + probabilities : tensor-like, optional + Probabilities from which the samples were drawn. Returned only if + `return_probabilities` is set to `True`. 
+ """ + rval = self._sample(Y=Y) + samples, log_likelihood, probabilities = rval + + if not return_log_likelihood and not return_probabilities: + return samples + else: + rval = [samples] + if return_log_likelihood: + rval.append(log_likelihood) + if return_probabilities: + rval.append(probabilities) + return tuple(rval) + + def _log_likelihood(self, X, Y): + raise NotImplementedError() + + def log_likelihood(self, X, Y): + """ + Computes the conditional log-likelihood of a batch of observed examples + on a per-example basis + + Parameters + ---------- + X : tensor-like + Batch of observed examples + Y : tensor-like + Batch of conditioning examples + + Returns + ------- + rval : tensor-like + Conditional Log-likelihood for the batch of visible examples, with + shape (X.shape[0],) + """ + return self._log_likelihood(X=X, Y=Y) + + +class ProductOfBernoulli(JointDistribution): + """ + Random binary vector whose distribution is a product of Bernoulli + distributions, i.e. + + p(v) = \prod_i v_i ** p_i * (1 - v_i) ** (1 - p_i) + """ + def __init__(self, dim): + """ + Parameters + ---------- + dim : int + Dimension of the random binary vector + """ + self.dim = dim + + # Parameter initialization + b_value = numpy.zeros(self.dim) + self.b = sharedX(b_value, 'b') + self.p = T.nnet.sigmoid(self.b) + + # Space initialization + self.input_space = NullSpace() + self.output_space = VectorSpace(dim=self.dim) + + def _sample(self, num_samples): + samples = theano_rng.uniform((num_samples, self.dim)) <= self.p + log_likelihood = self.log_likelihood(samples) + probabilities = T.zeros_like(samples) + self.p + return samples, log_likelihood, probabilities + + def _log_likelihood(self, X): + return (X * T.log(self.p) + (1 - X) * T.log(1 - self.p)).sum(axis=1) + + @wraps(Model.get_params) + def get_params(self): + return [self.b] + + +class StochasticSigmoid(ConditionalDistribution): + """ + Implements the conditional distribution of a random binary vector x given + an input vector y as a product of Bernoulli distributions, i.e. 
+
+    p(x | y) = \prod_i p(x_i | y),
+
+    where
+
+    p(x_i = 1 | y) = sigmoid(y.W_i + b_i)
+    """
+    def __init__(self, dim, dim_cond, clamp_sigmoid=False):
+        """
+        Parameters
+        ----------
+        dim : int
+            Dimension of the modeled vector x
+        dim_cond : int
+            Dimension of the conditioning vector y
+        clamp_sigmoid : bool, optional
+            If `True`, clamps sigmoid outputs away from 0 and 1 so that
+            their logarithm is always finite. Defaults to `False`.
+        """
+        self.dim_cond = dim_cond
+        self.dim = dim
+        self.clamp_sigmoid = clamp_sigmoid
+
+        # Bias initialization
+        b_value = numpy.zeros(self.dim)
+        self.b = sharedX(b_value, 'b')
+
+        # Weights initialization
+        W_value = self._initialize_weights(self.dim_cond, self.dim)
+        self.W = sharedX(W_value, 'W')
+
+        # Space initialization
+        self.input_space = VectorSpace(dim=self.dim_cond)
+        self.target_space = VectorSpace(dim=self.dim)
+
+    def _sigmoid(self, x):
+        """
+        Computes the sigmoid of x, optionally clamped away from 0 and 1
+
+        Parameters
+        ----------
+        x : tensor-like
+            Pre-sigmoid activations
+        """
+        if self.clamp_sigmoid:
+            return T.nnet.sigmoid(x) * 0.9999 + 0.000005
+        else:
+            return T.nnet.sigmoid(x)
+
+    def _sample(self, Y):
+        batch_size = Y.shape[0]
+        probabilities = self._sigmoid(T.dot(Y, self.W) + self.b)
+        samples = theano_rng.uniform((batch_size, self.dim)) <= probabilities
+        log_likelihood = (
+            samples * T.log(probabilities) +
+            (1 - samples) * T.log(1 - probabilities)
+        ).sum(axis=1)
+
+        return samples, log_likelihood, probabilities
+
+    def _log_likelihood(self, X, Y):
+        p = self._sigmoid(T.dot(Y, self.W) + self.b)
+        return (X * T.log(p) + (1 - X) * T.log(1 - p)).sum(axis=1)
+
+    @wraps(Model.get_params)
+    def get_params(self):
+        return [self.W, self.b]
+
+    def get_weights(self):
+        return self.W.get_value()
diff --git a/code/pylearn2/models/directed_probabilistic/nade.py b/code/pylearn2/models/directed_probabilistic/nade.py
new file mode 100644
index 0000000..2b3fcaa
--- /dev/null
+++ b/code/pylearn2/models/directed_probabilistic/nade.py
@@ -0,0 +1,475 @@
+"""
+Neural autoregressive density estimator (NADE) implementation
+"""
+__authors__ = "Vincent Dumoulin"
+__copyright__ = "Copyright 2014, Universite de Montreal"
+__credits__ = ["Jorg Bornschein", "Vincent Dumoulin"]
+__license__ = "3-clause BSD"
+__maintainer__ = "Vincent Dumoulin"
+
+
+import numpy
+import theano
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+from pylearn2.models.model import Model
+from pylearn2.utils import sharedX
+from pylearn2.space import VectorSpace
+# from research.code.pylearn2.utils.unrolled_scan import unrolled_scan
+from research.code.pylearn2.models.directed_probabilistic import (
+    JointDistribution, ConditionalDistribution
+)
+
+
+theano_rng = RandomStreams(seed=2341)
+
+
+class NADEBase(Model):
+    """
+    Base class factoring out the parameters and utilities shared by NADE
+    and CNADE
+    """
+    def __init__(self, dim, dim_hid, clamp_sigmoid=False, unroll_scan=1):
+        """
+        Parameters
+        ----------
+        dim : int
+            Number of observed binary variables
+        dim_hid : int
+            Number of latent binary variables
+        clamp_sigmoid : bool, optional
+            If `True`, clamps sigmoid outputs away from 0 and 1 so that
+            their logarithm is always finite. Defaults to `False`.
+        unroll_scan : int, optional
+            Unrolling factor for the scan loop over input dimensions (see
+            `utils/unrolled_scan.py`). Defaults to 1.
+ """ + super(NADEBase, self).__init__() + + self.dim = dim + self.dim_hid = dim_hid + self.clamp_sigmoid = clamp_sigmoid + self.unroll_scan = unroll_scan + + self.input_space = VectorSpace(dim=self.dim) + + # Visible biases + b_value = numpy.zeros(self.dim) + self.b = sharedX(b_value, 'b') + # Hidden biases + c_value = numpy.zeros(self.dim_hid) + self.c = sharedX(c_value, 'c') + # Encoder weights + W_value = self._initialize_weights(self.dim, self.dim_hid) + self.W = sharedX(W_value, 'W') + # Decoder weights + V_value = self._initialize_weights(self.dim_hid, self.dim) + self.V = sharedX(V_value, 'V') + + def _initialize_weights(self, dim_0, dim_1): + """ + Initialize a (dim_0, dim_1)-shaped weight matrix + + Parameters + ---------- + dim_0 : int + First dimension of the weights matrix + dim_1 : int + Second dimension of the weights matrix + + Returns + ------- + rval : `numpy.ndarray` + A (dim_0, dim_1)-shaped, properly initialized weights matrix + """ + rval = (2 * numpy.random.normal(size=(dim_0, dim_1)) - 1) / dim_0 + return rval + + def sigmoid(self, x): + """ + WRITEME + + Parameters + ---------- + x : WRITEME + """ + if self.clamp_sigmoid: + return T.nnet.sigmoid(x)*0.9999 + 0.000005 + else: + return T.nnet.sigmoid(x) + + def get_params(self): + """ + Returns + ------- + params : list of tensor-like + The model's parameters + """ + return [self.b, self.c, self.W, self.V] + + def get_weights(self): + """ + Aliases to `NADE.get_encoder_weights` + """ + return self.get_encoder_weights() + + def set_weights(self, weights): + """ + Aliases to `NADE.set_encoder_weights` + """ + self.set_encoder_weights(weights) + + def get_encoder_weights(self): + """ + Returns + ------- + rval : `numpy.ndarray` + Encoder weights + """ + return self.W.get_value() + + def set_encoder_weights(self, weights): + """ + Sets encoder weight values + + Parameters + ---------- + weights : `numpy.ndarray` + Encoder weight values to assign to self.W + """ + self.W.set_value(weights) + + def get_decoder_weights(self): + """ + Returns + ------- + rval : `numpy.ndarray` + Decoder weights + """ + return self.V.get_value() + + def set_decoder_weights(self, weights): + """ + Sets decoder weight values + + Parameters + ---------- + weights : `numpy.ndarray` + Decoder weight values to assign to self.V + """ + self.V.set_value(weights) + + def get_visible_biases(self): + """ + Returns + ------- + rval : `numpy.ndarray` + Visible biases + """ + return self.b.get_value() + + def set_visible_biases(self, biases): + """ + Sets visible bias values + + Parameters + ---------- + biases : `numpy.ndarray` + Visible bias values to assign to self.b + """ + self.b.set_value(biases) + + def get_hidden_biases(self): + """ + Returns + ------- + rval : `numpy.ndarray` + Hidden biases + """ + return self.c.get_value() + + def set_hidden_biases(self, biases): + """ + Sets hidden bias values + + Parameters + ---------- + biases : `numpy.ndarray` + Hidden bias values to assign to self.c + """ + self.c.set_value(biases) + + def _base_log_likelihood(self, X, W, V, b, c): + """ + Computes the log-likelihood of a batch of visible examples + + Parameters + ---------- + X : tensor-like + Batch of visible examples + W : tensor-like + Encoder weights + V : tensor-like + Decoder weights + b : tensor-like + Visible biases + c : tensor-like + Hidden biases + + Returns + ------- + rval : tensor-like + Log-likelihood for the batch of visible examples + """ + # Transformation matrix. 
+        #
+        #     [[[       0,       0, ...,        0],
+        #       [X[0, 0],       0, ...,        0],
+        #       [X[0, 0], X[0, 1], ...,        0],
+        #       [X[0, 0], X[0, 1], ..., X[0, d]]],
+        #      ...
+        #      [[       0,       0, ...,        0],
+        #       [X[n, 0],       0, ...,        0],
+        #       [X[n, 0], X[n, 1], ...,        0],
+        #       [X[n, 0], X[n, 1], ..., X[n, d]]]]
+        #
+        # Its purpose is to make the `W_{.,
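
---

For reference, the quantity `_base_log_likelihood` computes is the standard
NADE factorization log p(x) = sum_i log p(x_i | x_{<i}). Below is a minimal
NumPy sketch of that computation for a single example, not part of the patch;
it assumes the shapes used above (`W` is (dim, dim_hid), `V` is
(dim_hid, dim), `b` is (dim,), `c` is (dim_hid,)), whereas the patched code
runs the same recurrence over a whole batch with Theano's scan:

import numpy


def sigmoid(x):
    return 1.0 / (1.0 + numpy.exp(-x))


def nade_log_likelihood(x, W, V, b, c):
    """Log-likelihood of one binary vector x under NADE parameters."""
    a = c.copy()          # hidden pre-activation for the empty prefix
    log_likelihood = 0.0
    for i in range(len(x)):
        h = sigmoid(a)                             # encodes x_{<i}
        p = sigmoid(numpy.dot(h, V[:, i]) + b[i])  # p(x_i = 1 | x_{<i})
        log_likelihood += (x[i] * numpy.log(p)
                           + (1 - x[i]) * numpy.log(1 - p))
        a += x[i] * W[i, :]                        # fold x_i into the prefix
    return log_likelihood


# Example usage with randomly initialized (hypothetical) parameters:
rng = numpy.random.RandomState(0)
dim, dim_hid = 8, 4
W = rng.uniform(-1.0 / dim, 1.0 / dim, size=(dim, dim_hid))
V = rng.uniform(-1.0 / dim_hid, 1.0 / dim_hid, size=(dim_hid, dim))
b, c = numpy.zeros(dim), numpy.zeros(dim_hid)
x = rng.randint(0, 2, size=dim)
print(nade_log_likelihood(x, W, V, b, c))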