Add ResnetTransformer

yezhengkai · yezhengkai · commit a69433fd9ddb · 2021-05-02T21:30:02.000+08:00
diff --git a/im2latex/models/__init__.py b/im2latex/models/__init__.py
@@ -1 +1,2 @@
 from .cnn_lstm import CNNLSTM
+from .resnet_transformer import ResnetTransformer
diff --git a/im2latex/models/resnet_transformer.py b/im2latex/models/resnet_transformer.py
@@ -0,0 +1,199 @@
+import argparse
+import math
+from typing import Any, Dict
+
+import torch
+import torch.nn as nn
+import torchvision
+
+from .transformer_util import PositionalEncoding, PositionalEncodingImage, generate_square_subsequent_mask
+
+TF_DIM = 256  # default for "tf_dim": Transformer model width (d_model)
+TF_FC_DIM = 1024  # default for "tf_fc_dim": feed-forward width of each decoder layer
+TF_DROPOUT = 0.4  # default for "tf_dropout": dropout used inside the decoder layers
+TF_LAYERS = 4  # default for "tf_layers": number of stacked decoder layers
+TF_NHEAD = 4  # default for "tf_nhead": number of attention heads per decoder layer
+RESNET_DIM = 512  # hard-coded: input channel count of the encoder projection applied to the ResNet feature map
+
+
+class ResnetTransformer(nn.Module):
+    """Encode an image with a ResNet-18 trunk and decode the feature sequence with a Transformer decoder."""
+
+    def __init__(self, data_config: Dict[str, Any], args: argparse.Namespace = None,) -> None:
+        """Build the model; reads "input_dims", "output_dims", "mapping" from data_config and tf_* from args."""
+        super().__init__()
+        self.data_config = data_config
+        self.input_dims = data_config["input_dims"]
+        self.num_classes = len(data_config["mapping"])
+        inverse_mapping = {val: ind for ind, val in enumerate(data_config["mapping"])}
+        self.start_token = inverse_mapping["<S>"]
+        self.end_token = inverse_mapping["<E>"]
+        self.padding_token = inverse_mapping["<P>"]
+        self.unknown_token = inverse_mapping["<U>"]
+        self.max_output_length = data_config["output_dims"][0] + 2  # presumably + 2 for <S> and <E> -- TODO confirm
+        self.args = vars(args) if args is not None else {}
+
+        self.dim = self.args.get("tf_dim", TF_DIM)
+        tf_fc_dim = self.args.get("tf_fc_dim", TF_FC_DIM)
+        tf_nhead = self.args.get("tf_nhead", TF_NHEAD)
+        tf_dropout = self.args.get("tf_dropout", TF_DROPOUT)
+        tf_layers = self.args.get("tf_layers", TF_LAYERS)
+
+        # ## Encoder part - should output  vector sequence of length self.dim per sample
+        resnet = torchvision.models.resnet18(pretrained=False)
+        self.resnet = torch.nn.Sequential(*(list(resnet.children())[:-2]))  # Exclude AvgPool and Linear layers
+        # Resnet will output (B, RESNET_DIM, _H, _W) logits where _H = input_H // 32, _W = input_W // 32
+
+        # self.encoder_projection = nn.Conv2d(RESNET_DIM, self.dim, kernel_size=(2, 1), stride=(2, 1), padding=0)
+        self.encoder_projection = nn.Conv2d(RESNET_DIM, self.dim, kernel_size=1)
+        # encoder_projection will output (B, dim, _H, _W) logits
+
+        if isinstance(self.input_dims, list):
+            # Per-axis maxima: max() over (C, H, W) entries compares lexicographically and can under-size the width
+            max_height = max(dims[1] for dims in self.input_dims)
+            max_width = max(dims[2] for dims in self.input_dims)
+        else:
+            max_height, max_width = self.input_dims[1], self.input_dims[2]
+        self.enc_pos_encoder = PositionalEncodingImage(d_model=self.dim, max_h=max_height, max_w=max_width)
+
+        # ## Decoder part
+        self.embedding = nn.Embedding(self.num_classes, self.dim)
+        self.fc = nn.Linear(self.dim, self.num_classes)
+
+        self.dec_pos_encoder = PositionalEncoding(d_model=self.dim, max_len=self.max_output_length)
+
+        self.y_mask = generate_square_subsequent_mask(self.max_output_length)
+
+        self.transformer_decoder = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=self.dim, nhead=tf_nhead, dim_feedforward=tf_fc_dim, dropout=tf_dropout),
+            num_layers=tf_layers,
+        )
+
+        self.init_weights()  # This is empirically important
+
+    def init_weights(self):
+        """Re-initialise the token embedding, the output layer and the 1x1 encoder projection."""
+        initrange = 0.1
+        self.embedding.weight.data.uniform_(-initrange, initrange)
+        self.fc.bias.data.zero_()
+        self.fc.weight.data.uniform_(-initrange, initrange)
+
+        nn.init.kaiming_normal_(self.encoder_projection.weight.data, a=0, mode="fan_out", nonlinearity="relu")
+        if self.encoder_projection.bias is not None:
+            weight = self.encoder_projection.weight.data
+            _fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(weight)  # pylint: disable=protected-access
+            bound = 1 / math.sqrt(fan_out)
+            # NOTE(review): normal_ takes (mean, std), i.e. mean=-bound, std=bound here; was uniform_ intended?
+            nn.init.normal_(self.encoder_projection.bias, -bound, bound)
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        x
+            (B, C, H, W) image; a single-channel input is repeated to 3 channels
+
+        Returns
+        -------
+        torch.Tensor
+            (Sx, B, E) logits
+        """
+        _B, C, _H, _W = x.shape
+        if C == 1:
+            x = x.repeat(1, 3, 1, 1)
+        x = self.resnet(x)  # (B, RESNET_DIM, _H // 32, _W // 32)
+        x = self.encoder_projection(x)  # (B, E, _H // 32, _W // 32)
+
+        # x = x * math.sqrt(self.dim)  # (B, E, _H // 32, _W // 32)  # This prevented any learning
+        x = self.enc_pos_encoder(x)  # (B, E, Ho, Wo);     Ho = _H // 32, Wo = _W // 32
+        x = torch.flatten(x, start_dim=2)  # (B, E, Ho * Wo)
+        x = x.permute(2, 0, 1)  # (Sx, B, E);    Sx = Ho * Wo
+        return x
+
+    def decode(self, x, y):
+        """
+        Parameters
+        ----------
+        x
+            (Sx, B, E) encoder output as returned by encode(), used as the decoder memory
+        y
+            (B, Sy) with elements in [0, C-1] where C is num_classes
+
+        Returns
+        -------
+        torch.Tensor
+            (Sy, B, C) logits
+        """
+        y_padding_mask = y == self.padding_token  # (B, Sy), True where the target token is padding
+        y = y.permute(1, 0)  # (Sy, B)
+        y = self.embedding(y) * math.sqrt(self.dim)  # (Sy, B, E)
+        y = self.dec_pos_encoder(y)  # (Sy, B, E)
+        Sy = y.shape[0]
+        # Crop the precomputed (max_output_length, max_output_length) mask and match x's dtype/device
+        y_mask = self.y_mask[:Sy, :Sy].type_as(x)
+        output = self.transformer_decoder(tgt=y, memory=x, tgt_mask=y_mask, tgt_key_padding_mask=y_padding_mask)
+        output = self.fc(output)  # (Sy, B, C)
+        return output
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        x
+            (B, C, H, W) image
+        y
+            (B, Sy) with elements in [0, C-1] where C is num_classes
+
+        Returns
+        -------
+        torch.Tensor
+            (B, C, Sy) logits
+        """
+        x = self.encode(x)  # (Sx, B, E)
+        output = self.decode(x, y)  # (Sy, B, C)
+        return output.permute(1, 2, 0)  # (B, C, Sy)
+
+    def predict(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        x
+            (B, C, H, W) image
+
+        Returns
+        -------
+        torch.Tensor
+            (B, Sy) with elements in [0, C-1] where C is num_classes
+        """
+        B = x.shape[0]
+        S = self.max_output_length
+        x = self.encode(x)  # (Sx, B, E)
+
+        output_tokens = (torch.ones((B, S)) * self.padding_token).type_as(x).long()  # (B, S)
+        output_tokens[:, 0] = self.start_token  # Set start token
+        for Sy in range(1, S):
+            y = output_tokens[:, :Sy]  # (B, Sy)
+            output = self.decode(x, y)  # (Sy, B, C)
+            output = torch.argmax(output, dim=-1)  # (Sy, B)
+            output_tokens[:, Sy] = output[-1:]  # Set the last output token
+
+            # Early stopping of prediction loop to speed up prediction
+            if ((output_tokens[:, Sy] == self.end_token) | (output_tokens[:, Sy] == self.padding_token)).all():
+                break
+
+        # Set all tokens after end token to be padding
+        for Sy in range(1, S):
+            ind = (output_tokens[:, Sy - 1] == self.end_token) | (output_tokens[:, Sy - 1] == self.padding_token)
+            output_tokens[ind, Sy] = self.padding_token
+
+        return output_tokens  # (B, Sy)
+
+    @staticmethod
+    def add_to_argparse(parser):
+        """Register the --tf_* options on parser, defaulting to the module-level TF_* constants, and return it."""
+        parser.add_argument("--tf_dim", type=int, default=TF_DIM)
+        parser.add_argument("--tf_fc_dim", type=int, default=TF_FC_DIM)
+        parser.add_argument("--tf_dropout", type=float, default=TF_DROPOUT)
+        parser.add_argument("--tf_layers", type=int, default=TF_LAYERS)
+        parser.add_argument("--tf_nhead", type=int, default=TF_NHEAD)
+        return parser
diff --git a/im2latex/models/transformer_util.py b/im2latex/models/transformer_util.py
@@ -0,0 +1,76 @@
+"""Position Encoding and other utilities for Transformers"""
+import math
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+# Hide lines below until Lab 7
+class PositionalEncodingImage(nn.Module):
+    """
+    Adds a fixed 2-D sinusoidal positional encoding to a (B, d_model, H, W) encoder feature map.
+
+    Following https://arxiv.org/abs/2103.06450 by Sumeet Singh.
+    """
+
+    def __init__(self, d_model: int, max_h: int = 2000, max_w: int = 2000) -> None:
+        super().__init__()
+        self.d_model = d_model
+        assert d_model % 2 == 0, f"Embedding depth {d_model} is not even"
+        # Precompute the (d_model, max_h, max_w) table once; stored as a buffer, not a parameter
+        self.register_buffer("pe", self.make_pe(d_model=d_model, max_h=max_h, max_w=max_w))
+
+    @staticmethod
+    def make_pe(d_model: int, max_h: int, max_w: int) -> torch.Tensor:
+        """Build the (d_model, max_h, max_w) table: first half of the channels encodes the row, second the column."""
+        half = d_model // 2
+        rows = PositionalEncoding.make_pe(d_model=half, max_len=max_h)  # (max_h, 1, d_model // 2)
+        rows = rows.permute(2, 0, 1).expand(-1, -1, max_w)  # (d_model // 2, max_h, max_w)
+
+        cols = PositionalEncoding.make_pe(d_model=half, max_len=max_w)  # (max_w, 1, d_model // 2)
+        cols = cols.permute(2, 1, 0).expand(-1, max_h, -1)  # (d_model // 2, max_h, max_w)
+        return torch.cat([rows, cols], dim=0)  # (d_model, max_h, max_w)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return x, shaped (B, d_model, H, W), with the positional encoding added."""
+        assert x.shape[1] == self.pe.shape[0]  # type: ignore
+        # Crop the table to this feature map's H and W; the addition broadcasts over the batch dimension
+        cropped = self.pe[:, : x.size(2), : x.size(3)]  # type: ignore
+        return x + cropped
+
+
+# Hide lines above until Lab 7
+
+
+class PositionalEncoding(torch.nn.Module):
+    """Classic Attention-is-all-you-need positional encoding: adds a fixed sinusoidal table, then applies dropout."""
+
+    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
+        super().__init__()
+        self.dropout = torch.nn.Dropout(p=dropout)
+        pe = self.make_pe(d_model=d_model, max_len=max_len)  # (max_len, 1, d_model)
+        self.register_buffer("pe", pe)
+
+    @staticmethod
+    def make_pe(d_model: int, max_len: int) -> torch.Tensor:
+        """Return the (max_len, 1, d_model) table: sine in even channels, cosine in odd; any d_model >= 1 works."""
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])  # odd d_model has one fewer cosine column
+        return pe.unsqueeze(1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Add the encoding of the first S positions to x, shaped (S, B, d_model), then apply dropout."""
+        assert x.shape[2] == self.pe.shape[2]  # type: ignore
+        x = x + self.pe[: x.size(0)]  # type: ignore
+        return self.dropout(x)
+
+
+def generate_square_subsequent_mask(size: int) -> torch.Tensor:
+    """Build a (size, size) float mask holding 0.0 on and below the diagonal and -inf above it."""
+    future = torch.triu(torch.ones(size, size), diagonal=1) == 1  # True strictly above the diagonal
+    blocked = torch.zeros(size, size, dtype=torch.float).masked_fill(future, float("-inf"))
+    return blocked

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`from .cnn_lstm import CNNLSTM`
	`2`	`+from .resnet_transformer import ResnetTransformer`