Commit b08216d

Rebuild
1 parent 9932582 commit b08216d

432 files changed (+88787, -8327 lines changed)

@@ -0,0 +1,253 @@
# -*- coding: utf-8 -*-
r"""
Sequence Models and Long Short-Term Memory Networks
====================================================

At this point, we have seen various feed-forward networks. That is,
there is no state maintained by the network at all. This might not be
the behavior we want. Sequence models are central to NLP: they are
models where there is some sort of dependence through time between your
inputs. The classical example of a sequence model is the Hidden Markov
Model for part-of-speech tagging. Another example is the conditional
random field.

A recurrent neural network is a network that maintains some kind of
state. For example, its output could be used as part of the next input,
so that information can propagate along as the network passes over the
sequence. In the case of an LSTM, for each element in the sequence,
there is a corresponding *hidden state* :math:`h_t`, which in principle
can contain information from arbitrary points earlier in the sequence.
We can use the hidden state to predict words in a language model,
part-of-speech tags, and a myriad of other things.


LSTMs in PyTorch
~~~~~~~~~~~~~~~~

Before getting to the example, note a few things. PyTorch's LSTM expects
all of its inputs to be 3D tensors. The semantics of the axes of these
tensors are important. The first axis is the sequence itself, the second
indexes instances in the mini-batch, and the third indexes elements of
the input. We haven't discussed mini-batching, so let's just ignore that
and assume the second axis always has size 1. If we want to run the
sequence model over the sentence "The cow jumped", our input should look
like

.. math::

   \begin{bmatrix}
   \overbrace{q_\text{The}}^\text{row vector} \\
   q_\text{cow} \\
   q_\text{jumped}
   \end{bmatrix}

Except remember there is an additional 2nd dimension with size 1.

In addition, you could go through the sequence one element at a time, in
which case the 1st axis will have size 1 also.

Let's see a quick example.
"""

# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

######################################################################

lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# Initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
for i in inputs:
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# Alternatively, we can do the entire sequence all at once.
# The first value returned by LSTM is all of the hidden states throughout
# the sequence. The second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below; they are the same).
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence;
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time.
# Add the extra 2nd dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
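
######################################################################
# To make the "The cow jumped" picture from the introduction concrete, here is
# a small sketch (not part of the original text). With the toy input dimension
# of 3 used above, the sentence becomes a tensor of shape
# (sequence length, mini-batch, input dimension) = (3, 1, 3). The q_* vectors
# below are just random stand-ins for real word embeddings.

q_the, q_cow, q_jumped = torch.randn(3), torch.randn(3), torch.randn(3)
cow_sentence = torch.stack([q_the, q_cow, q_jumped]).view(3, 1, 3)  # add the batch axis of size 1
cow_out, cow_hidden = lstm(cow_sentence)  # the hidden state defaults to zeros when omitted
print(cow_out.shape)  # torch.Size([3, 1, 3]): one hidden state per word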


######################################################################
# Example: An LSTM for Part-of-Speech Tagging
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In this section, we will use an LSTM to get part-of-speech tags. We will
# not use Viterbi or Forward-Backward or anything like that, but as a
# (challenging) exercise to the reader, think about how Viterbi could be
# used after you have seen what is going on. In this example, we also refer
# to embeddings. If you are unfamiliar with embeddings, you can read up
# about them `here <https://tutorials.pytorch.kr/beginner/nlp/word_embeddings_tutorial.html>`__.
#
# The model is as follows: let our input sentence be
# :math:`w_1, \dots, w_M`, where :math:`w_i \in V`, our vocab. Also, let
# :math:`T` be our tag set, and :math:`y_i` the tag of word :math:`w_i`.
# Denote our prediction of the tag of word :math:`w_i` by
# :math:`\hat{y}_i`.
#
# This is a structure prediction model, where our output is a sequence
# :math:`\hat{y}_1, \dots, \hat{y}_M`, where :math:`\hat{y}_i \in T`.
#
# To do the prediction, pass an LSTM over the sentence. Denote the hidden
# state at timestep :math:`i` as :math:`h_i`. Also, assign each tag a
# unique index (like how we had word\_to\_ix in the word embeddings
# section). Then our prediction rule for :math:`\hat{y}_i` is
#
# .. math:: \hat{y}_i = \text{argmax}_j \ (\log \text{Softmax}(Ah_i + b))_j
#
# That is, take the log softmax of the affine map of the hidden state,
# and the predicted tag is the tag that has the maximum value in this
# vector. Note that this immediately implies that the dimensionality of the
# target space of :math:`A` is :math:`|T|`.
#
#
# Prepare data:

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = [
    # Tags are: DET - determiner; NN - noun; V - verb
    # For example, the word "The" is a determiner
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word a unique index
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}  # Assign each tag a unique index

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

######################################################################
# Create the model:


class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

######################################################################
# Train the model:


model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training.
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs; it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple". i,j corresponds to the score for
    # tag j for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1,
    # since 0 is the index of the maximum value of row 1,
    # 1 is the index of the maximum value of row 2, etc.,
    # which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)
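
######################################################################
# Not in the original tutorial, but as a quick sanity check we can apply the
# prediction rule from above explicitly: take the argmax over each row of
# tag_scores and map the indices back through tag_to_ix.

ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
predicted_ixs = torch.argmax(tag_scores, dim=1)
print([ix_to_tag[ix.item()] for ix in predicted_ixs])  # expected: ['DET', 'NN', 'V', 'DET', 'NN']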


######################################################################
# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In the example above, each word had an embedding, which served as the
# inputs to our sequence model. Let's augment the word embeddings with a
# representation derived from the characters of the word. We expect that
# this should help significantly, since character-level information like
# affixes has a large bearing on part-of-speech. For example, words with
# the affix *-ly* are almost always tagged as adverbs in English.
#
# To do this, let :math:`c_w` be the character-level representation of
# word :math:`w`. Let :math:`x_w` be the word embedding as before. Then
# the input to our sequence model is the concatenation of :math:`x_w` and
# :math:`c_w`. So if :math:`x_w` has dimension 5, and :math:`c_w`
# dimension 3, then our LSTM should accept an input of dimension 8.
#
# To get the character-level representation, do an LSTM over the
# characters of a word, and let :math:`c_w` be the final hidden state of
# this LSTM. Hints (a sketch of one possible skeleton follows these hints):
#
# * There are going to be two LSTMs in your new model.
#   The original one that outputs POS tag scores, and the new one that
#   outputs a character-level representation of each word.
# * To do a sequence model over characters, you will have to embed characters.
#   The character embeddings will be the input to the character LSTM.
#
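
######################################################################
# The scaffold below is not part of the original tutorial; it is only one
# possible way to set the exercise up. The class and argument names are made
# up for illustration, and the forward pass is deliberately left unimplemented
# so the exercise itself is not spoiled.

class CharAugmentedTagger(nn.Module):

    def __init__(self, word_embedding_dim, char_embedding_dim, char_hidden_dim,
                 hidden_dim, vocab_size, charset_size, tagset_size):
        super(CharAugmentedTagger, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.char_embeddings = nn.Embedding(charset_size, char_embedding_dim)
        # LSTM #1: reads the characters of a single word and summarizes them as c_w.
        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)
        # LSTM #2: reads [x_w ; c_w] for each word, so its input size is the sum
        # of the two dimensions (e.g. 5 + 3 = 8 in the text above).
        self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, word_ixs, char_ixs_per_word):
        # 1. For each word, run self.char_lstm over its character embeddings and
        #    keep the final hidden state as c_w.
        # 2. Concatenate c_w with the word embedding x_w.
        # 3. Run self.lstm over the concatenated sequence and map its hidden
        #    states to tag scores with self.hidden2tag, as in LSTMTagger above.
        raise NotImplementedError("Filling in the forward pass is the exercise.")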

@@ -0,0 +1,123 @@
"""
torch.vmap
==========
This tutorial introduces torch.vmap, an autovectorizer for PyTorch operations.
torch.vmap is a prototype feature and cannot handle a number of use cases;
however, we would like to gather use cases for it to inform the design. If you
are considering using torch.vmap or think it would be really cool for something,
please contact us at https://github.com/pytorch/pytorch/issues/42368.

So, what is vmap?
-----------------
vmap is a higher-order function. It accepts a function `func` and returns a new
function that maps `func` over some dimension of the inputs. It is highly
inspired by JAX's vmap.

Semantically, vmap pushes the "map" into the PyTorch operations called by `func`,
effectively vectorizing those operations.
"""
import torch
# NB: vmap is only available on nightly builds of PyTorch.
# You can download one at pytorch.org if you're interested in testing it out.
from torch import vmap

####################################################################
# The first use case for vmap is making it easier to handle
# batch dimensions in your code. One can write a function `func`
# that runs on examples and then lift it to a function that can
# take batches of examples with `vmap(func)`. `func`, however,
# is subject to many restrictions:
#
# - it must be functional (one cannot mutate a Python data structure
#   inside of it), with the exception of in-place PyTorch operations.
# - batches of examples must be provided as Tensors. This means that
#   vmap doesn't handle variable-length sequences out of the box.
#
# One example of using `vmap` is to compute batched dot products. PyTorch
# doesn't provide a batched `torch.dot` API; instead of unsuccessfully
# rummaging through docs, use `vmap` to construct a new function:

torch.dot                            # [D], [D] -> []
batched_dot = torch.vmap(torch.dot)  # [N, D], [N, D] -> [N]
x, y = torch.randn(2, 5), torch.randn(2, 5)
batched_dot(x, y)
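
####################################################################
# As a quick check (not in the original text): the vmapped function agrees with
# mapping `torch.dot` over the batch in ordinary Python.
expected_dots = torch.stack([torch.dot(a, b) for a, b in zip(x, y)])
assert torch.allclose(batched_dot(x, y), expected_dots)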

####################################################################
# `vmap` can be helpful in hiding batch dimensions, leading to a simpler
# model authoring experience.
batch_size, feature_size = 3, 5
weights = torch.randn(feature_size, requires_grad=True)

# Note that model doesn't work with a batch of feature vectors because
# torch.dot must take 1D tensors. It's pretty easy to rewrite this
# to use `torch.matmul` instead, but if we didn't want to do that or if
# the code is more complicated (e.g., does some advanced indexing
# shenanigans), we can simply call `vmap`. `vmap` batches over ALL
# inputs, unless otherwise specified with the in_dims argument
# (please see the documentation for more details; a small sketch follows below).
def model(feature_vec):
    # Very simple linear model with activation
    return feature_vec.dot(weights).relu()

examples = torch.randn(batch_size, feature_size)
result = torch.vmap(model)(examples)
expected = torch.stack([model(example) for example in examples.unbind()])
assert torch.allclose(result, expected)
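
####################################################################
# A small sketch (not in the original text) of the in_dims argument mentioned
# above: here the samples are batched along dimension 0 while the weight vector
# is shared across the batch, so we pass in_dims=(0, None).
shared_dot = torch.vmap(torch.dot, in_dims=(0, None))  # [N, D], [D] -> [N]
result_shared = shared_dot(examples, weights)
assert torch.allclose(result_shared,
                      torch.stack([torch.dot(e, weights) for e in examples.unbind()]))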

####################################################################
# `vmap` can also help vectorize computations that were previously difficult
# or impossible to batch. This brings us to our second use case: batched
# gradient computation.
#
# - https://github.com/pytorch/pytorch/issues/8304
# - https://github.com/pytorch/pytorch/issues/23475
#
# The PyTorch autograd engine computes vjps (vector-Jacobian products).
# Using vmap, we can compute (batched vector)-Jacobian products.
#
# One example of this is computing a full Jacobian matrix (this can also be
# applied to computing a full Hessian matrix).
# Computing a full Jacobian matrix for some function f: R^N -> R^N usually
# requires N calls to `autograd.grad`, one per Jacobian row.

# Setup
N = 5

def f(x):
    return x ** 2

x = torch.randn(N, requires_grad=True)
y = f(x)
basis_vectors = torch.eye(N)

# Sequential approach
jacobian_rows = [torch.autograd.grad(y, x, v, retain_graph=True)[0]
                 for v in basis_vectors.unbind()]
jacobian = torch.stack(jacobian_rows)

# Using `vmap`, we can vectorize the whole computation, computing the
# Jacobian in a single call to `autograd.grad`.
def get_vjp(v):
    return torch.autograd.grad(y, x, v)[0]

jacobian_vmap = vmap(get_vjp)(basis_vectors)
assert torch.allclose(jacobian_vmap, jacobian)
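
####################################################################
# Not in the original text: if your build also has
# `torch.autograd.functional.jacobian`, it can serve as a reference to check
# the vmap-based result against (it recomputes f(x) internally).
jacobian_reference = torch.autograd.functional.jacobian(f, x)
assert torch.allclose(jacobian_vmap, jacobian_reference)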

####################################################################
# The third main use case for vmap is computing per-sample-gradients.
# This is something that the vmap prototype cannot handle performantly
# right now. We're not sure what the API for computing per-sample-gradients
# should be, but if you have ideas, please comment in
# https://github.com/pytorch/pytorch/issues/7786.

def model(sample, weight):
    # do something...
    return torch.dot(sample, weight)

def grad_sample(sample):
    # `weights` here is the shared parameter vector defined earlier; the vjp of
    # the scalar output gives this sample's gradient with respect to it.
    return torch.autograd.functional.vjp(lambda weight: model(sample, weight), weights)[1]

# The following doesn't actually work in the vmap prototype. But it
# could be an API for computing per-sample-gradients.

# batch_of_samples = torch.randn(64, 5)
# vmap(grad_sample)(batch_of_samples)
