Commit b33227d

[dicp][ascend] Optimize stable_diffusion performance on ascend. (#704)
* Optimize stable_diffusion performance.
* Add switch config for sd bmm_fp16.
* Fix ci llama hf case.
* Fix review comments.
* Remove redundant singleton design.
* Redesign infer_shape interface.
* Remove unused codes.
* Clean more code.
* Add empty line for code-style.
1 parent cda8286 commit b33227d

File tree

5 files changed: +38 -4 lines changed
dicp/dicp/vendor/AscendGraph/codegen/ascend.py

+2 -2

@@ -1116,13 +1116,13 @@ def MatMul(name, x1, x2, trans_x1: bool, trans_x2: bool):
         return op.to_node()
 
     @staticmethod
-    def BatchMatMul(name, x1, x2, adj_x1: bool, adj_x2: bool):
+    def BatchMatMul(name, x1, x2, adj_x1: bool, adj_x2: bool, keep_dtype=1):
         op = OP(name, "BatchMatMul")
         op.set_input("x1", x1)
         op.set_attr_bool("adj_x1", adj_x1)
         op.set_input("x2", x2)
         op.set_attr_bool("adj_x2", adj_x2)
-        op.set_attr_int("_keep_dtype", 1)
+        op.set_attr_int("_keep_dtype", keep_dtype)
         return op.to_node()
 
     @staticmethod
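The new `keep_dtype` argument is forwarded straight into the `_keep_dtype` attribute of the generated BatchMatMul node; the default of 1 reproduces the previously hard-coded behaviour, while 0 presumably frees the Ascend graph engine to run the matmul in fp16. A minimal sketch of the plumbing, using a hypothetical stand-in `OP` builder (the real one lives elsewhere in `ascend.py`):

```python
# Hypothetical stand-in for the OP builder used by the codegen; only the calls
# visible in the diff (set_input / set_attr_* / to_node) are mimicked here.
class OP:
    def __init__(self, name, op_type):
        self.node = {"name": name, "type": op_type, "inputs": {}, "attrs": {}}

    def set_input(self, key, value):
        self.node["inputs"][key] = value

    def set_attr_bool(self, key, value):
        self.node["attrs"][key] = bool(value)

    def set_attr_int(self, key, value):
        self.node["attrs"][key] = int(value)

    def to_node(self):
        return self.node


def BatchMatMul(name, x1, x2, adj_x1: bool, adj_x2: bool, keep_dtype=1):
    op = OP(name, "BatchMatMul")
    op.set_input("x1", x1)
    op.set_attr_bool("adj_x1", adj_x1)
    op.set_input("x2", x2)
    op.set_attr_bool("adj_x2", adj_x2)
    op.set_attr_int("_keep_dtype", keep_dtype)
    return op.to_node()


# {'adj_x1': False, 'adj_x2': False, '_keep_dtype': 1}  -> old behaviour
print(BatchMatMul("bmm_keep", "x1", "x2", False, False)["attrs"])
# {'adj_x1': False, 'adj_x2': False, '_keep_dtype': 0}  -> fp16-friendly path
print(BatchMatMul("bmm_fp16", "x1", "x2", False, False, keep_dtype=0)["attrs"])
```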

dicp/dicp/vendor/AscendGraph/conversion.py

+6 -1

@@ -1,3 +1,4 @@
+import os
 import functools
 import operator
 import _operator
@@ -26,6 +27,8 @@
 prims = torch.ops.prims
 conversions = {}
 
+sd_fp16 = int(os.environ.get("SD_FP16", 0))
+
 
 def get_reduction_str(r):
     if r == 0:
@@ -1173,7 +1176,7 @@ def mm(self, x, y):
     @register_conversion(aten.bmm.default)
     def bmm(self, x, y):
         out_dtype = fx_traceback.get_current_meta()['val'].dtype
-        bmm = self.get_proxy(ascend_op.BatchMatMul, (x, y, False, False))
+        bmm = self.get_proxy(ascend_op.BatchMatMul, (x, y, False, False, sd_fp16 ^ 1))
         return self.get_proxy(ascend_op.Cast, (bmm, get_ascend_dtype(out_dtype)))
 
     @register_conversion(torch.torch.ops.aten.addmm)
@@ -1292,6 +1295,8 @@ def _softmax(self, x, dim=-1, half_to_float=False):
         if isinstance(dim, int):
             dim = [dim]
         assert (half_to_float is False)
+        if sd_fp16 is not None and int(sd_fp16) == 1:
+            x = self.get_proxy(ascend_op.Cast, (x, get_ascend_dtype(torch.float16)))
         return self.get_proxy(ascend_op.SoftmaxV2, (x, dim))
 
     @register_conversion(torch.ops.aten.sum.default)
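The switch is read once at module import time. Setting `SD_FP16=1` flips the `keep_dtype` flag handed to `BatchMatMul` to 0 (via `sd_fp16 ^ 1`) and additionally casts the softmax input to fp16; leaving it unset keeps the previous full-precision lowering. A minimal sketch of just that decision logic, outside dicp:

```python
import os

# Mirrors the module-level switch in conversion.py: unset or "0" keeps the old
# behaviour, "1" opts in to the fp16-friendly stable-diffusion lowering.
sd_fp16 = int(os.environ.get("SD_FP16", 0))

# keep_dtype handed to BatchMatMul: the xor simply inverts the switch,
# so SD_FP16=1 -> keep_dtype=0 and SD_FP16 unset/0 -> keep_dtype=1.
keep_dtype = sd_fp16 ^ 1

# The softmax conversion only inserts an fp16 Cast when the switch is on.
cast_softmax_to_fp16 = (sd_fp16 == 1)

print(f"SD_FP16={sd_fp16} -> keep_dtype={keep_dtype}, "
      f"cast softmax input to fp16: {cast_softmax_to_fp16}")
```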

dicp/dicp/vendor/AscendGraph/opset_convert.py

+5 -1

@@ -3,6 +3,7 @@
 from dicp.dynamo_bridge.compile_fx import is_torch_210
 from dicp.vendor.AscendGraph.ascend_op import MatMul, CastToCpu, IdentityInp
 from dicp.vendor.AscendGraph.conversion import AtenToAscendTransformer
+from ...dynamo_bridge.graph import GraphTransformer
 
 if is_torch_210:
     from dicp.dynamo_bridge.op_transformer import BackendPatternMatcherTransformer
@@ -18,7 +19,7 @@ def transform(self, gm: torch.fx.graph_module):
         for n in gm.graph.nodes:
             if hasattr(n, 'op') and n.op == 'placeholder':
                 fake_tensor = n.meta['val']
-                memo = fake_tensor.fake_mode.fake_tensor_converter.tensor_memo
+                memo = fake_tensor.fake_mode.fake_tensor_converter.tensor_memo
                 for key in memo:
                     if id(memo[key].fake_device) == id(fake_tensor.fake_device):
                         memory_format = torch_dipu.get_native_memory_format(key())
@@ -86,6 +87,9 @@ def ascendgraph_opset_convert(
 
     # For bug in pytorch
     # Avoid for dynamic shape
+    gt = GraphTransformer(gm, "ascendgraph")
+    gt.infer_shape_dtype()
+    gm = gt.gm
     if is_torch_210 and not symint_in_inputs(list(gm.graph.nodes)):
         gm = BackendPatternMatcherTransformer(
             ascend_pattern_matcher, ascend_patterns_cls_list).transform(gm)
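With this change, shape/dtype inference runs on the FX graph before the Ascend pattern matcher, presumably so that patterns which depend on node metadata (such as the fused BMM pattern below) see it already populated. dicp's own pass is `GraphTransformer.infer_shape_dtype`; as a rough, self-contained analogue of what such a pass produces, torch.fx's generic `ShapeProp` (a stock PyTorch utility, not part of dicp) fills in per-node tensor metadata:

```python
import torch
from torch.fx import symbolic_trace
from torch.fx.passes.shape_prop import ShapeProp


class M(torch.nn.Module):
    def forward(self, x, y):
        # A bmm-on-transposed-operand graph, similar in spirit to the
        # stable-diffusion attention subgraphs targeted by this commit.
        return torch.bmm(x, y.transpose(1, 2))


gm = symbolic_trace(M())
# Propagate shapes/dtypes through the traced graph with example inputs.
ShapeProp(gm).propagate(torch.randn(2, 5, 8), torch.randn(2, 7, 8))

for node in gm.graph.nodes:
    tm = node.meta.get("tensor_meta")
    if tm is not None:
        print(node.name, tuple(tm.shape), tm.dtype)
```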

dicp/dicp/vendor/AscendGraph/pattern_replacement.py

+23

@@ -44,6 +44,8 @@ def pattern(self, repeat, dim, input_shape, empty_device, view_1_shape,
     def replacement(self, repeat, dim):
         return torch.ops.aten.repeat_interleave.self_int(self, repeat, dim)
 
+Muls = torch.fx.wrap(ascend_op.Muls.get_singleton())
+Shape = torch.fx.wrap(ascend_op.Shape.get_singleton())
 Const = torch.fx.wrap(ascend_op.Const.get_singleton())
 Transpose = torch.fx.wrap(ascend_op.Transpose.get_singleton())
 Identity = torch.fx.wrap(ascend_op.Identity.get_singleton())
@@ -71,6 +73,27 @@ def replacement(x1, x2, dtype):
         return BatchMatMul(x1, reshape, adj_x1=False, adj_x2=True)
 
 
+@register_ascend_pattern
+class FuseBmmTransposeMulsPattern(BackendPatternBase):
+    @staticmethod
+    def pattern(x1, x2, c1, c2):
+        transpose = Transpose(x2, c1)
+        muls = Muls(transpose, 0.3535533905932738)
+        identity = Identity(muls, None)
+        identity1 = Identity(identity, None)
+        reshape = Reshape(identity1, c2)
+        return BatchMatMul(x1, reshape, False, False, 0)
+
+    @staticmethod
+    def replacement(x1, x2, c1, c2):
+        x2 = Reshape(x2, c2)
+        perm = Permute(x2, [0, 2, 1])
+        shape = Shape(perm)
+        reshape = Reshape(x2, shape)
+        muls = Muls(reshape, 0.3535533905932738)
+        return BatchMatMul(x1, muls, adj_x1=False, adj_x2=True, keep_dtype=0)
+
+
 # @pandaoxin negotiate with @tangzhiyi
 # another submit would implement
 # @register_ascend_pattern
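The new pattern rewrites a `Transpose` + `Muls` + `Identity` + `Reshape` chain feeding a plain `BatchMatMul` into a single `BatchMatMul` with `adj_x2=True` acting on the scaled operand, relying on the fact that multiplying by a scalar commutes with the transpose. The exact `Reshape`/`Shape` plumbing and the `c1`/`c2` constants depend on the traced stable-diffusion graph, but the algebraic core can be checked in plain PyTorch (the constant 0.3535533905932738 is 1/sqrt(8)):

```python
# Quick numerical check (plain PyTorch, outside dicp) of the identity the
# fusion relies on: scaling by a constant commutes with the batch transpose,
#   x1 @ (s * x2^T) == x1 @ (s * x2)^T,
# so the Transpose/Muls chain can be folded into one BatchMatMul with
# adj_x2=True on the scaled operand.
import torch

s = 0.3535533905932738  # == 1 / sqrt(8)
x1 = torch.randn(2, 5, 8)
x2 = torch.randn(2, 7, 8)

# Before the rewrite: x2 is transposed and scaled, then fed to a plain bmm.
before = torch.bmm(x1, s * x2.transpose(1, 2))

# After the rewrite: x2 is scaled first and the transpose is folded into the
# matmul itself (adj_x2=True corresponds to transposing x2's last two dims).
after = torch.bmm(x1, (s * x2).transpose(1, 2))

assert torch.allclose(before, after)
print("max abs diff:", (before - after).abs().max().item())
```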

dicp/test/model/test_stable_diffusion.py

+2

@@ -39,6 +39,8 @@ def test_inference(
         prompt = "A photo of an astronaut riding a horse on mars."
         utils.update_dynamo_config(dynamic=dynamic)
         torch_dipu.dipu.set_device(device)
+        if backend == "ascendgraph":
+            os.environ["SD_FP16"] = "1"
 
         # CPU
         torch.manual_seed(1)
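Outside the test, the switch can be enabled the same way; since `conversion.py` reads `SD_FP16` once at module import time, it has to be set before the dicp AscendGraph conversion module is first loaded, i.e. before the model is compiled with the `ascendgraph` backend. A minimal sketch (the pipeline construction itself is elided):

```python
import os

# Opt in to the fp16 bmm/softmax lowering for stable diffusion on Ascend.
# conversion.py reads SD_FP16 once at import time, so set it before the
# first compilation triggers that import.
os.environ["SD_FP16"] = "1"

# ... then build the stable-diffusion pipeline and compile it, e.g.
# unet = torch.compile(unet, backend="ascendgraph", dynamic=False)
```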
