Commit 8e7eebf

[1.8] Fix onnx mixed precision export for layernorm & fuseLogSoftmaxNllLoss (pytorch#52510)
Co-authored-by: Shubham Bhokare <[email protected]>
Parent: f8afb8b

File tree (4 files changed, +89 −3 lines):

test/onnx/test_pytorch_onnx_onnxruntime_cuda.py
torch/csrc/jit/passes/onnx/peephole.cpp
torch/onnx/symbolic_helper.py
torch/onnx/symbolic_opset9.py
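For context, here is a minimal export sketch of the scenario the commit addresses (not part of the commit; the module shape, file name, and opset are illustrative, and a CUDA build of PyTorch is assumed):

import torch

# LayerNorm fed fp16 CUDA inputs, the case the new test_layer_norm_fp16
# exercises. Before this fix, the layer_norm symbolic emitted fixed-dtype
# fp32 constants that clashed with the Half tensors in the exported graph.
model = torch.nn.LayerNorm([10, 10]).eval().half().cuda()
x = torch.randn(20, 5, 10, 10, dtype=torch.float16, device='cuda')
torch.onnx.export(model, x, "layer_norm_fp16.onnx", opset_version=12)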

test/onnx/test_pytorch_onnx_onnxruntime_cuda.py

Lines changed: 39 additions & 0 deletions
@@ -2,6 +2,8 @@
 import onnxruntime  # noqa
 import torch
 
+from torch.cuda.amp import autocast
+
 from test_pytorch_common import skipIfUnsupportedMinOpsetVersion
 from test_pytorch_common import skipIfNoCuda
@@ -24,6 +26,43 @@ def forward(self, x):
         x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
         self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5)
 
+    @skipIfUnsupportedMinOpsetVersion(9)
+    @skipIfNoCuda
+    def test_layer_norm_fp16(self):
+        class LayerNormModel(torch.nn.Module):
+            def __init__(self):
+                super(LayerNormModel, self).__init__()
+                self.layer_norm = torch.nn.LayerNorm([10, 10])
+
+            def forward(self, x):
+                return self.layer_norm(x)
+
+        x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
+        self.run_test(LayerNormModel(), x, rtol=1e-3, atol=1e-5)
+
+
+    @skipIfUnsupportedMinOpsetVersion(12)
+    @skipIfNoCuda
+    def test_softmaxCrossEntropy_fusion_fp16(self):
+        class FusionModel(torch.nn.Module):
+            def __init__(self):
+                super(FusionModel, self).__init__()
+                self.loss = torch.nn.NLLLoss(reduction='none')
+                self.m = torch.nn.LogSoftmax(dim=1)
+
+            @autocast()
+            def forward(self, input, target):
+                output = self.loss(self.m(2 * input), target)
+                return output
+
+        N, C = 5, 4
+        input = torch.randn(N, 16, dtype=torch.float16, device=torch.device('cuda'))
+        target = torch.empty(N, dtype=torch.long, device=torch.device('cuda')).random_(0, C)
+
+        # using test data containing default ignore_index=-100
+        target[target == 1] = -100
+        self.run_test(FusionModel(), (input, target))
+
 TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp
 TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test
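A side note on why a Cast shows up in the traced graph at all (a sketch, not from the commit; assumes a CUDA device and standard autocast behavior, under which log_softmax runs in float32):

import torch
from torch.cuda.amp import autocast

x = torch.randn(5, 16, dtype=torch.float16, device='cuda')
with autocast():
    y = torch.nn.functional.log_softmax(2 * x, dim=1)
# torch.float32: autocast upcasts log_softmax, so the exported graph gains
# an onnx::Cast between LogSoftmax and NegativeLogLikelihoodLoss
print(y.dtype)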

torch/csrc/jit/passes/onnx/peephole.cpp

Lines changed: 32 additions & 1 deletion
@@ -668,14 +668,32 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
     auto prev = it->input(0)->node();
     Node* origNllLossNode = *it;
     Node* origLogSoftmaxNode;
+
+    // Check for patterns especially in cases with autocasting enabled
+    // in which a cast node is inserted before the NegativeLogLikelihoodLoss
+    // node and this causes the patterns below not to be recognizable by the
+    // fuseLogSoftmaxNllLoss function
+    // For example if the input is 2D
+    // graph(%input : Half(3, 5),
+    //       %target : Long(3)):
+    //   %4 : Half(3, 5) = onnx::LogSoftmax[axis=1](%input)
+    //   %8 : Float = onnx::Cast[to=1](%4)
+    //   %9 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"](%8, %target)
+    //   return (%9)
+    Node* castNode = nullptr;
+    if (prev->kind() == onnx::Cast) {
+      castNode = prev;
+      prev = prev->input(0)->node();
+    }
+
     if (prev->kind() == onnx::LogSoftmax) {
       // if the input is 2D
       // graph(%input : Float(3, 5),
       //       %target : Long(3)):
       //   %4 : Float(3, 5) = onnx::LogSoftmax[axis=1](%input)
       //   %8 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"](%4, %target)
       //   return (%8)
-      origLogSoftmaxNode = it->input(0)->node();
+      origLogSoftmaxNode = prev;
     } else if (
         prev->kind() == onnx::Transpose &&
         prev->input(0)->node()->kind() == onnx::LogSoftmax) {
@@ -751,6 +769,19 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
       continue;
     }
 
+    // If the pattern indeed consists of a cast node before the
+    // NegativeLogLikelihoodLoss node, place a cast node in the beginning
+    // of the pattern instead
+    if (castNode != nullptr) {
+      auto onnx_type = castNode->i(attr::to);
+      Node* cast_node = b->owningGraph()->create(onnx::Cast, 1);
+      cast_node->addInput(origLogSoftmaxNode->inputs().at(0));
+      cast_node->i_(attr::to, onnx_type);
+      cast_node->insertBefore(origLogSoftmaxNode);
+      origLogSoftmaxNode->replaceInputWith(
+          origLogSoftmaxNode->inputs().at(0), cast_node->output());
+    }
+
     Node* softmaxCrossEntropyNode = b->owningGraph()->create(
         onnx::SoftmaxCrossEntropyLoss, it->outputs().size());
     for (size_t i = 0; i < softmaxCrossEntropyNode->outputs().size(); ++i) {
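To see the fusion end to end, a small export check (a sketch, not from the commit; CPU and fp32 are used here for simplicity, and the model and shapes are illustrative):

import io
import onnx
import torch

class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.m = torch.nn.LogSoftmax(dim=1)
        self.loss = torch.nn.NLLLoss(reduction='none')

    def forward(self, input, target):
        return self.loss(self.m(input), target)

input = torch.randn(3, 5)
target = torch.randint(0, 5, (3,))
f = io.BytesIO()
torch.onnx.export(Model(), (input, target), f, opset_version=12)
ops = [n.op_type for n in onnx.load_from_string(f.getvalue()).graph.node]
# expect a single SoftmaxCrossEntropyLoss in place of the separate
# LogSoftmax + NegativeLogLikelihoodLoss pair
print(ops)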

torch/onnx/symbolic_helper.py

Lines changed: 16 additions & 0 deletions
@@ -296,6 +296,22 @@ def _is_fp(value):
         return (type == 'Float') or (type == 'Double') or (type == 'Half')
     return False
 
+def _generate_wrapped_number(g, scalar):
+    """
+    Create a wrapped number based on https://github.com/pytorch/pytorch/issues/9515
+    A Tensor is considered a "wrapped number" if it is
+    auto-wrapped from a C++ or Python number type. Integer types are
+    wrapped as 0-dim int64 tensors and floating-point types are
+    wrapped as 0-dim double tensors.
+
+    The input to this function is a constant value. If the data type
+    is a floating point type, it is converted to a 0-dim double
+    tensor, else it is converted to a 0-dim tensor of its original type.
+    """
+    assert not isinstance(scalar, torch.Tensor)
+    if isinstance(scalar, float):
+        return g.op("Constant", value_t=torch.tensor(scalar, dtype=torch.double))
+    return g.op("Constant", value_t=torch.tensor(scalar))
 
 def _sort_helper(g, input, dim, decending=True, out=None):
     if out is not None:
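The wrapped-number rule the docstring refers to is observable in eager mode. A quick illustration (not from the commit) of why a scalar constant leaves a Half tensor's dtype alone while an ordinary fp32 tensor would widen it:

import torch

x = torch.ones(3, dtype=torch.float16)
print(torch.result_type(x, 2.0))                # torch.float16 (wrapped number)
print(torch.result_type(x, torch.tensor(2.0)))  # torch.float16 (0-dim tensor)
print(torch.result_type(x, torch.ones(3)))      # torch.float32 (1-dim tensor wins)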

torch/onnx/symbolic_opset9.py

Lines changed: 2 additions & 2 deletions
@@ -1319,8 +1319,8 @@ def layer_norm(g, input, normalized_shape, weight, bias, eps, cudnn_enable):
 
     axes = [-i for i in range(len(normalized_shape), 0, -1)]
 
-    two_cst = g.op("Constant", value_t=torch.tensor(2.))
-    eps_cst = g.op("Constant", value_t=torch.tensor(eps))
+    two_cst = sym_help._generate_wrapped_number(g, 2.)
+    eps_cst = sym_help._generate_wrapped_number(g, eps)
 
     mean = g.op("ReduceMean", input, axes_i=axes)
     numerator = sub(g, input, mean)
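For reference, an eager-mode sketch of the decomposition this symbolic emits (an approximation for illustration, not the exporter code; weight, bias, and the exact op sequence are omitted):

import torch

def layer_norm_reference(x, normalized_shape, eps=1e-5):
    # mirror the symbolic: ReduceMean, Sub, Pow(2), ReduceMean, Add(eps), Sqrt, Div
    dims = tuple(range(-len(normalized_shape), 0))
    mean = x.mean(dim=dims, keepdim=True)
    var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
    return (x - mean) / torch.sqrt(var + eps)

x = torch.randn(20, 5, 10, 10)
expected = torch.nn.functional.layer_norm(x, [10, 10])
assert torch.allclose(layer_norm_reference(x, [10, 10]), expected, atol=1e-5)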
