This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 9b6dcbc

cpuhrsch authored and facebook-github-bot committed
NestedTensor import 20210730
Summary: Code import

Reviewed By: janeyx99

Differential Revision: D30016452

fbshipit-source-id: 211add77274159d590343e05920211b2a886b58f
1 parent 423a9e9 commit 9b6dcbc

File tree

8 files changed: +139 −24 lines changed


nestedtensor/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@
 from .nested.nested import transpose_nchw_nhwc
 from .nested.nested import transpose_nhwc_nchw
 
+from .nested.fuser import fuse_conv_bn
+
 from . import nested
 
 from . import _C

nestedtensor/csrc/UnaryOps.cpp

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ Tensor& NestedTensor_clamp_out(
     Tensor& result) {
   apply_nested_tensor(
       [min, max](const at::Tensor self, at::Tensor result) {
-        at::native::clamp_out(self, min, max, result);
+        at::clamp_out(result, self, min, max);
      },
      self,
      result);
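The fix switches from the deprecated at::native::clamp_out(self, min, max, result) call to the dispatcher's out-variant, which takes the result tensor first. A rough Python-level sketch of the same per-constituent semantics (the tensor names are illustrative assumptions, not part of the commit):

import torch

# Clamp each constituent into a pre-allocated output buffer; torch.clamp's
# out= keyword mirrors the ATen out-variant called by the lambda above.
t0, t1 = torch.randn(3), torch.randn(5)
out0, out1 = torch.empty_like(t0), torch.empty_like(t1)
for src, dst in ((t0, out0), (t1, out1)):
    torch.clamp(src, min=-1.0, max=1.0, out=dst)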

nestedtensor/csrc/conv2d.cpp

Lines changed: 5 additions & 3 deletions
@@ -28,7 +28,9 @@ Tensor NestedTensor_conv2d(
   TORCH_CHECK(get_dim(input) == 4, "Expected input to be dim 4, but got ", get_dim(input), ".");
 #ifdef WITH_CUDA
   auto self_opt_sizes = get_opt_sizes(input);
-  if (is_nested_tensor_impl(input) && !is_nested_tensor_impl(weight) && input.dtype() == torch::kFloat16) {
+  if (is_nested_tensor_impl(input) &&
+      !is_nested_tensor_impl(weight) &&
+      (input.dtype() == torch::kFloat16 || input.dtype() == torch::kFloat32)) {
     if (get_dim(input) == 4 && !bias && weight.size(2) == 1 && weight.size(3) == 1 &&
         stride[0] == 1 && stride[1] == 1 &&
         padding[0] == 0 && padding[1] == 0 &&
@@ -38,7 +40,7 @@ Tensor NestedTensor_conv2d(
         *self_opt_sizes[1] &&
         get_is_cuda(input)
        ) {
-      if (get_is_contiguous(input, c10::MemoryFormat::ChannelsLast) && input.dtype() == torch::kHalf) {
+      if (get_is_contiguous(input, c10::MemoryFormat::ChannelsLast)) {
         Tensor input_buffer = get_buffer(input);
         input_buffer = input_buffer.view({-1, weight.size(1)});
         at::Tensor result_buffer = at::matmul(input_buffer,
@@ -56,7 +58,7 @@ Tensor NestedTensor_conv2d(
         }, new_sizes);
         return wrap_buffer(result_buffer.view(-1), new_sizes, new_strides);
       }
-      if (get_is_contiguous(input) && input.dtype() == torch::kHalf) {
+      if (get_is_contiguous(input)) {
         input = transpose_nchw_nhwc(input);
         Tensor input_buffer = get_buffer(input);
         input_buffer = input_buffer.reshape({-1, weight.size(1)});
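This change lets the 1x1 fast path run for float32 as well as float16 inputs. The path itself relies on the fact that a 1x1 convolution with stride 1, no padding and no bias is a matmul over the channel dimension; a small dense-tensor sketch of that equivalence (the shapes are illustrative assumptions, not taken from the commit):

import torch

x = torch.randn(2, 8, 5, 5)                  # N, C_in, H, W
w = torch.randn(16, 8, 1, 1)                 # C_out, C_in, 1, 1
ref = torch.nn.functional.conv2d(x, w)       # 1x1 conv, stride 1, no padding, no bias
via_matmul = torch.matmul(
    x.permute(0, 2, 3, 1).reshape(-1, 8),    # NHWC, flattened to (N*H*W, C_in)
    w.view(16, 8).t(),                       # (C_in, C_out)
).reshape(2, 5, 5, 16).permute(0, 3, 1, 2)   # back to NCHW
assert torch.allclose(ref, via_matmul, atol=1e-4)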

nestedtensor/csrc/mha.cpp

Lines changed: 4 additions & 4 deletions
@@ -55,14 +55,14 @@ at::Tensor min_mha(
 
   q = q * torch::tensor(scaling);
 
-  q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
-  k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
-  v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
+  q = q.reshape({*opt_sizes[0], -1, num_heads, head_dim}).transpose(1, 2);
+  k = k.reshape({*opt_sizes[0], -1, num_heads, head_dim}).transpose(1, 2);
+  v = v.reshape({*opt_sizes[0], -1, num_heads, head_dim}).transpose(1, 2);
   auto attn_output_weights = at::matmul(q, k.transpose(2, 3));
   attn_output_weights = at::softmax(attn_output_weights, -1);
   attn_output_weights = at::dropout(attn_output_weights, dropout_p, training);
   auto attn_output = at::matmul(attn_output_weights, v);
-  attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim});
+  attn_output = attn_output.transpose(1, 2).reshape({*opt_sizes[0], -1, edim});
   attn_output = at::matmul(attn_output, out_proj_weight.t());
   attn_output = attn_output + out_proj_bias;
   return attn_output;
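The reshapes now take the batch size from opt_sizes[0] instead of passing -1 for both leading dimensions. A plausible reading (an assumption, not stated in the commit) is that reshape can infer at most one dimension, as this dense-tensor sketch shows:

import torch

batch, num_heads, head_dim = 2, 2, 4
q = torch.randn(batch * 7, num_heads * head_dim)   # (batch * seq, embed_dim)
try:
    q.reshape(-1, -1, num_heads, head_dim)          # two inferred dims: rejected
except RuntimeError as err:
    print("reshape refused:", err)
q = q.reshape(batch, -1, num_heads, head_dim)       # explicit batch, seq inferred
print(q.shape)                                      # torch.Size([2, 7, 2, 4])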

nestedtensor/csrc/shape.cpp

Lines changed: 4 additions & 14 deletions
@@ -13,13 +13,8 @@ Tensor NestedTensor_view(const Tensor& self, IntArrayRef size) {
   TORCH_CHECK(
       int64_t(size.size()) > self_data->nested_dim(),
       "view cannot be exclusive to nested dimensions.");
-  for (int64_t i = 0; i < self_data->nested_dim(); i++) {
-    if (size[i] >= 0) {
-      throw std::runtime_error(
-          "Cannot view explicitly along irregular dimension " +
-          std::to_string(i) + ". Please use -1 as a placeholder.");
-    }
-  }
+  auto self_opt_sizes = get_opt_sizes(self);
+  TORCH_CHECK(*self_opt_sizes[0] == size[0], "First dimension must be unchanged.");
   int64_t nested_dim = self_data->nested_dim();
   std::vector<int64_t> target_shape;
   for (int64_t i = nested_dim; i < int64_t(size.size()); i++) {
@@ -38,13 +33,8 @@ Tensor NestedTensor_reshape(const Tensor& self, IntArrayRef size) {
   TORCH_CHECK(
       int64_t(size.size()) > self_data->nested_dim(),
       "Reshape cannot be exclusive to nested dimensions.");
-  for (int64_t i = 0; i < self_data->nested_dim(); i++) {
-    if (size[i] >= 0) {
-      throw std::runtime_error(
-          "Cannot reshape explicitly along irregular dimension " +
-          std::to_string(i) + ". Please use -1 as a placeholder.");
-    }
-  }
+  auto self_opt_sizes = get_opt_sizes(self);
+  TORCH_CHECK(*self_opt_sizes[0] == size[0], "First dimension must be unchanged.");
   int64_t nested_dim = self_data->nested_dim();
   std::vector<int64_t> target_shape;
   for (int64_t i = nested_dim; i < int64_t(size.size()); i++) {
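With the -1 placeholder requirement dropped, view and reshape now simply insist that the first (nested) dimension be passed through unchanged. A hedged usage sketch (the NestedTensor shapes and the exact call are assumptions, not taken from the commit):

import torch
import nestedtensor

# Two constituents of 6 elements each: the leading dimension of the target
# shape must stay 2, and the remaining dims reshape each constituent.
nt = nestedtensor.nested_tensor([torch.randn(6), torch.randn(6)])
out = nt.reshape(2, 2, 3)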

nestedtensor/nested/fuser.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import torch.fx as fx
+from typing import Type, Dict, Any, Tuple, Iterable
+import torch
+import copy
+from torch.fx import symbolic_trace
+import time
+
+def _parent_name(target : str) -> Tuple[str, str]:
+    """
+    Splits a qualname into parent path and last atom.
+    For example, `foo.bar.baz` -> (`foo.bar`, `baz`)
+    """
+    *parent, name = target.rsplit('.', 1)
+    return parent[0] if parent else '', name
+
+# Works for length 2 patterns with 2 modules
+def matches_module_pattern(pattern: Iterable[Type], node: fx.Node, modules: Dict[str, Any]):
+    if len(node.args) == 0:
+        return False
+    nodes: Tuple[Any, fx.Node] = (node.args[0], node)
+    for expected_type, current_node in zip(pattern, nodes):
+        if not isinstance(current_node, fx.Node):
+            return False
+        if current_node.op != 'call_module':
+            return False
+        if not isinstance(current_node.target, str):
+            return False
+        if current_node.target not in modules:
+            return False
+        if type(modules[current_node.target]) is not expected_type:
+            return False
+    return True
+
+
+def replace_node_module(node: fx.Node, modules: Dict[str, Any], new_module: torch.nn.Module):
+    assert(isinstance(node.target, str))
+    parent_name, name = _parent_name(node.target)
+    setattr(modules[parent_name], name, new_module)
+
+def computeUpdatedConvWeightAndBias(
+        bn_rv,
+        bn_eps,
+        bn_w,
+        bn_b,
+        bn_rm,
+        conv_w,
+        conv_b=None):
+    orig_dtype = bn_rv.dtype
+    bn_var_rsqrt = (bn_w / torch.sqrt(bn_rv.to(torch.double) + bn_eps))
+    new_w = (conv_w * (bn_var_rsqrt).reshape(-1, 1, 1, 1)).to(orig_dtype)
+    if conv_b is None:
+        return new_w
+    new_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
+    return new_w, new_b
+
+def fuse_conv_bn_eval(conv, bn):
+    assert(not (conv.training or bn.training)), "Fusion only for eval!"
+    fused_conv = copy.deepcopy(conv)
+    fused_conv.bias = None
+
+    fused_conv.weight = \
+        torch.nn.Parameter(computeUpdatedConvWeightAndBias(bn.running_var, bn.eps, bn.weight, bn.bias, bn.running_mean, fused_conv.weight))
+
+    return fused_conv
+
+def fuse_conv_bn(model: torch.nn.Module, inplace=False) -> torch.nn.Module:
+    """
+    Fuses convolution/BN layers for inference purposes. Will deepcopy your
+    model by default, but can modify the model inplace as well.
+    """
+    patterns = [(torch.nn.Conv2d, torch.nn.BatchNorm2d)]
+    if not inplace:
+        model = copy.deepcopy(model)
+    fx_model = fx.symbolic_trace(model)
+    modules = dict(fx_model.named_modules())
+    new_graph = copy.deepcopy(fx_model.graph)
+
+    for pattern in patterns:
+        for node in new_graph.nodes:
+            if matches_module_pattern(pattern, node, modules):
+                if len(node.args[0].users) > 1:  # Output of conv is used by other nodes
+                    continue
+                conv = modules[node.args[0].target]
+                bn = modules[node.target]
+                fused_conv = fuse_conv_bn_eval(conv, bn)
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                new_graph.erase_node(node)
+    return fx.GraphModule(fx_model, new_graph)
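fuse_conv_bn is also re-exported from the package root (see the __init__.py change above). A hedged usage sketch with a toy model (the model is an assumption, not part of the commit; with a bias-free convolution and freshly initialized BatchNorm statistics the fused and original modules agree):

import torch
import nestedtensor

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False),
    torch.nn.BatchNorm2d(8),
    torch.nn.ReLU(),
).eval()                                   # fusion asserts eval mode
fused = nestedtensor.fuse_conv_bn(model)   # deep-copies, traces, folds BN into the conv weight
x = torch.randn(1, 3, 16, 16)
assert torch.allclose(model(x), fused(x), atol=1e-5)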

nestedtensor/version.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-__version__ = '0.1.4+33fb247'
-git_version = '33fb2477c856f8185f1e9c1e9a6ca28065e43cf9'
+__version__ = '0.1.4+66764fd'
+git_version = '66764fd10e9b6f9c0710840d0cb17369b9d994be'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

test/test_nested_tensor_integration.py

Lines changed: 32 additions & 0 deletions
@@ -3,6 +3,12 @@
 import unittest
 from utils_test_case import TestCase
 
+try:
+    import classy_vision
+    TEST_CLASSY_VISION=True
+except ModuleNotFoundError:
+    TEST_CLASSY_VISION=False
+
 
 def ntnt(x): return nestedtensor.nested_tensor(x, requires_grad=True)
 def ntnt_nograd(x): return nestedtensor.nested_tensor(x, requires_grad=False)
@@ -180,6 +186,32 @@ def test_transformer_forward(self):
         for t0, t1 in zip(res_nt.unbind(), [res_0, res_1]):
             self.assertEqual(t0, t1)
 
+    @unittest.skipIf(not TEST_CLASSY_VISION, "No classy vision")
+    def test_fusion_resnext101_32x4d(self):
+        @torch.inference_mode()
+        def _test(dtype, use_channels_last):
+            from classy_vision.models import build_model
+            from torch.fx import symbolic_trace
+            model = build_model({"name": "resnext101_32x4d"}).eval().cuda()
+            model._initialize_weights(False)
+            fused = symbolic_trace(model)
+            fused = nestedtensor.fuse_conv_bn(fused)
+            model = model.to(dtype)
+            fused = fused.to(dtype)
+            data = torch.randn(2, 3, 50, 50, device=torch.device('cuda'), dtype=dtype)
+            if use_channels_last:
+                data = data.contiguous(memory_format=torch.channels_last)
+            ref_output = model(data)
+            new_output = fused(data)
+            if dtype == torch.float16:
+                self.assertEqual(ref_output, new_output, prec=2e-3)
+            else:
+                self.assertEqual(ref_output, new_output)
+        _test(torch.float16, False)
+        _test(torch.float32, False)
+        # _test(torch.float16, True)
+        _test(torch.float32, True)
+
 
 if __name__ == "__main__":
     unittest.main()

0 commit comments
