This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 4cc2a37

cpuhrsch authored and facebook-github-bot committed

NestedTensor import 20210805

Summary: Import from GH

Reviewed By: mthrok

Differential Revision: D30133587

fbshipit-source-id: 6b054a74bf05a13235b4c6dbb8e464ea7f595518
1 parent d00fd7c commit 4cc2a37

File tree

5 files changed: 177 additions, 12 deletions


nestedtensor/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 from .nested.nested import transpose_nhwc_nchw
 
 from .nested.fuser import fuse_conv_bn
+from .nested.fuser import fuse_conv_relu
+from .nested.fuser import fuse_conv_add_relu
 
 from . import nested
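
The two new exports are the FX fusion passes added in nestedtensor/nested/fuser.py below. A minimal usage sketch (not part of the commit; the toy model, shapes, and tolerance are illustrative assumptions, and the fused forward path requires CUDA/cuDNN):

import torch
import nestedtensor

# A small eval-mode model containing the Conv2d -> BatchNorm2d -> ReLU chain both passes target.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.BatchNorm2d(8),
    torch.nn.ReLU(),
).eval()

fused = nestedtensor.fuse_conv_bn(model)    # fold BatchNorm statistics into the conv weights
fused = nestedtensor.fuse_conv_relu(fused)  # rewrite Conv2d -> ReLU into a single Conv2dReLU module

if torch.cuda.is_available():
    data = torch.randn(2, 3, 32, 32, device="cuda")
    ref = model.cuda()(data)
    out = fused.cuda()(data)
    print(torch.allclose(ref, out, atol=1e-4))  # expect True up to numerical tolerance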

nestedtensor/csrc/UnaryOps.cpp

Lines changed: 5 additions & 5 deletions
@@ -103,14 +103,14 @@ Tensor NestedTensor_clamp_max(const Tensor& self, const c10::Scalar& min) {
 
 Tensor& NestedTensor_clamp_max_out(
     const Tensor& self,
-    const Scalar& min,
+    const Scalar& max,
     Tensor& result) {
   apply_nested_tensor(
-      [min](const Tensor self, Tensor result) {
-        at::native::clamp_max_out(self, min, result);
+      [max](Tensor result, const Tensor tensor) {
+        at::clamp_max_out(result, tensor, max);
       },
-      self,
-      result);
+      result,
+      self);
   return result;
 }
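
This hunk fixes the out variant: the scalar argument is a maximum rather than a minimum, the lambda passed to apply_nested_tensor now takes the output constituent first, and the dispatcher-level at::clamp_max_out is called instead of at::native::clamp_max_out. As a rough illustration of the intended per-constituent semantics (an assumption for clarity, written against plain torch tensors, not code from the repo):

import torch

# Stand-ins for the constituents of a nested tensor and of the preallocated result.
constituents = [torch.randn(3, 4), torch.randn(2, 5)]
results = [torch.empty_like(t) for t in constituents]

for t, r in zip(constituents, results):
    # Mirrors at::clamp_max_out(result, tensor, max) applied to each constituent pair.
    torch.clamp_max(t, 0.5, out=r)

print(all(r.max().item() <= 0.5 for r in results))  # expect True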

nestedtensor/nested/fuser.py

Lines changed: 160 additions & 0 deletions
@@ -87,3 +87,163 @@ def fuse_conv_bn(model: torch.nn.Module, inplace=False) -> torch.nn.Module:
                 node.replace_all_uses_with(node.args[0])
                 new_graph.erase_node(node)
     return fx.GraphModule(fx_model, new_graph)
+
+class Conv2dReLU(torch.nn.Module):
+    def __init__(self,
+                 weight,
+                 bias,
+                 stride,
+                 padding,
+                 dilation,
+                 groups):
+        super(Conv2dReLU, self).__init__()
+        self.weight = weight
+        self.weight_is_channels_last = False
+        self.bias = bias
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.slow_fusion = False
+        if self.weight.size(2) == 7 and self.weight.size(3) == 7:
+            self.slow_fusion = True
+
+    def forward(self, inp):
+        # NOTE: This will be faster once https://github.com/pytorch/pytorch/pull/62482 lands
+        if not self.slow_fusion and inp.is_contiguous(memory_format=torch.contiguous_format):
+            inp = inp.to(memory_format=torch.channels_last)
+        if self.slow_fusion and inp.is_contiguous(memory_format=torch.channels_last):
+            inp = inp.to(memory_format=torch.contiguous_format)
+        if not self.slow_fusion and not self.weight_is_channels_last:
+            self.weight.data = self.weight.to(memory_format=torch.channels_last)
+            inp = inp.to(memory_format=torch.channels_last)
+            self.weight_is_channels_last = True
+        return torch.cudnn_convolution_relu(inp,
+                                            self.weight,
+                                            self.bias,
+                                            self.stride,
+                                            self.padding,
+                                            self.dilation,
+                                            self.groups)
+
+class Conv2dAddReLU(torch.nn.Module):
+    def __init__(self,
+                 weight,
+                 bias,
+                 stride,
+                 padding,
+                 dilation,
+                 groups):
+        super(Conv2dAddReLU, self).__init__()
+        self.weight = weight
+        self.weight_is_channels_last = False
+        self.bias = bias
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.slow_fusion = False
+        if self.weight.size(2) == 7 and self.weight.size(3) == 7:
+            self.slow_fusion = True
+
+    def forward(self, inp, add_input):
+        # TODO: Reactivate this once cudnn_convolution_add_relu is fixed.
+        # weight = self.weight.to(memory_format=torch.contiguous_format)
+        # if not self.slow_fusion and inp.is_contiguous(memory_format=torch.contiguous_format):
+        #     inp = inp.to(memory_format=torch.channels_last)
+        #     add_input = add_input.to(memory_format=torch.channels_last)
+        # if self.slow_fusion and inp.is_contiguous(memory_format=torch.channels_last):
+        #     inp = inp.to(memory_format=torch.contiguous_format)
+        #     add_input = add_input.to(memory_format=torch.contiguous_format)
+        # if not self.slow_fusion and not self.weight_is_channels_last:
+        #     self.weight.data = self.weight.to(memory_format=torch.channels_last)
+        #     inp = inp.to(memory_format=torch.channels_last)
+        #     add_input = add_input.to(memory_format=torch.channels_last)
+        #     self.weight_is_channels_last = True
+        # return torch.cudnn_convolution_add_relu(inp,
+        #                                         self.weight,
+        #                                         add_input,
+        #                                         1.0,
+        #                                         self.bias,
+        #                                         self.stride,
+        #                                         self.padding,
+        #                                         self.dilation,
+        #                                         self.groups)
+        out = torch.conv2d(inp,
+                           self.weight,
+                           self.bias,
+                           self.stride,
+                           self.padding,
+                           self.dilation,
+                           self.groups)
+        out.add_(add_input)
+        out.relu_()
+        return out
+
+def fuse_conv_relu(model: torch.nn.Module, inplace=False) -> torch.nn.Module:
+    """
+    Fuses convolution/BN layers for inference purposes. Will deepcopy your
+    model by default, but can modify the model inplace as well.
+    """
+    patterns = [(torch.nn.Conv2d, torch.nn.ReLU)]
+    if not inplace:
+        model = copy.deepcopy(model)
+    fx_model = fx.symbolic_trace(model)
+    modules = dict(fx_model.named_modules())
+    new_graph = copy.deepcopy(fx_model.graph)
+
+    for pattern in patterns:
+        for node in new_graph.nodes:
+            if matches_module_pattern(pattern, node, modules):
+                if len(node.args[0].users) > 1:  # Output of conv is used by other nodes
+                    continue
+                conv = modules[node.args[0].target]
+                relu = modules[node.target]
+                fused_conv = Conv2dReLU(conv.weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups)
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                new_graph.erase_node(node)
+
+
+    last_nodes = []
+    count = 0
+    for node in new_graph.nodes:
+        if count == 31:
+            break
+        if (node.op == "call_function" or node.op == "call_module"):
+            last_nodes.append(node)
+        if len(last_nodes) == 4:
+            last_nodes = last_nodes[1:]
+        if len(last_nodes) < 3:
+            continue
+        is_match = True
+        is_match = is_match and (last_nodes[0].op == "call_module")
+        is_match = is_match and (last_nodes[1].op == "call_function")
+        is_match = is_match and (last_nodes[2].op == "call_module")
+        is_match = is_match and isinstance(modules[last_nodes[0].target], torch.nn.Conv2d)
+        is_match = is_match and (str(last_nodes[1]).split("_")[0] == "add")
+        is_match = is_match and isinstance(modules[last_nodes[2].target], torch.nn.ReLU)
+        if (is_match):
+            conv = modules[last_nodes[1].args[0].target]
+            fused_conv = Conv2dAddReLU(conv.weight, conv.bias, conv.stride, conv.padding, conv.dilation, conv.groups)
+            replace_node_module(last_nodes[2], modules, fused_conv)
+            last_nodes[2].args = (last_nodes[0].args[0], last_nodes[1].args[1])
+            new_graph.erase_node(last_nodes[1])
+            new_graph.erase_node(last_nodes[0])
+            count += 1
+    return fx.GraphModule(fx_model, new_graph)
+
+
+def fuse_conv_add_relu(model: torch.nn.Module, inplace=False) -> torch.nn.Module:
+    """
+    Fuses convolution/BN layers for inference purposes. Will deepcopy your
+    model by default, but can modify the model inplace as well.
+    """
+    if not inplace:
+        model = copy.deepcopy(model)
+    fx_model = fx.symbolic_trace(model)
+    modules = dict(fx_model.named_modules())
+    new_graph = copy.deepcopy(fx_model.graph)
+
+    new_graph.lint()
+    return fx.GraphModule(fx_model, new_graph)
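
fuse_conv_relu has two stages: a module-pattern pass that replaces Conv2d -> ReLU pairs with Conv2dReLU, and a sliding three-node window that looks for a call_module (Conv2d), a call_function add, and a call_module (ReLU) in sequence, replacing them with Conv2dAddReLU (capped at 31 rewrites). A minimal sketch of the kind of residual block that second matcher targets (a hypothetical toy module, not code from the repo):

import torch

class TinyResidual(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(8, 8, kernel_size=3, padding=1)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        # Traces to call_module (conv) -> call_function (add) -> call_module (relu),
        # exactly the three-node window Conv2dAddReLU is substituted for.
        return self.relu(self.conv(x) + x)

Under these assumptions, nestedtensor.fuse_conv_relu(TinyResidual().eval()) would rewire the ReLU node into a Conv2dAddReLU call on the conv weights, taking (x, x) as its inputs, and erase the original conv and add nodes. Note that fuse_conv_add_relu itself currently only lints and returns the retraced graph; the Conv+Add+ReLU rewrite lives in fuse_conv_relu.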

nestedtensor/version.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-__version__ = '0.1.4+66764fd'
-git_version = '66764fd10e9b6f9c0710840d0cb17369b9d994be'
+__version__ = '0.1.4+da883d9'
+git_version = 'da883d94a7cb250db7ec7d6d152764e6e8e8788a'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION

test/test_nested_tensor_integration.py

Lines changed: 8 additions & 5 deletions
@@ -2,6 +2,7 @@
 import nestedtensor
 import unittest
 from utils_test_case import TestCase
+from utils import debug_on
 
 try:
     import classy_vision
@@ -194,22 +195,24 @@ def _test(dtype, use_channels_last):
             from torch.fx import symbolic_trace
             model = build_model({"name": "resnext101_32x4d"}).eval().cuda()
             model._initialize_weights(False)
-            fused = symbolic_trace(model)
-            fused = nestedtensor.fuse_conv_bn(fused)
+            # This is needed to allow tracing, but for makes no difference for resnext
+            model = model.classy_model
+            fused = nestedtensor.fuse_conv_bn(model)
+            fused = nestedtensor.fuse_conv_relu(fused)
             model = model.to(dtype)
             fused = fused.to(dtype)
             data = torch.randn(2, 3, 50, 50, device=torch.device('cuda'), dtype=dtype)
+            ref_output = model(data)
             if use_channels_last:
                 data = data.contiguous(memory_format=torch.channels_last)
-            ref_output = model(data)
             new_output = fused(data)
             if dtype == torch.float16:
                 self.assertEqual(ref_output, new_output, prec=2e-3)
             else:
                 self.assertEqual(ref_output, new_output)
-        _test(torch.float16, False)
         _test(torch.float32, False)
-        # _test(torch.float16, True)
+        _test(torch.float16, False)
+        _test(torch.float16, True)
         _test(torch.float32, True)