
Commit 4445aa2

bertmaher authored and facebook-github-bot committed
Add layout options to gemm
Summary: We were only benchmarking `row-major x row-major` gemms (also called `TT` or `transpose-transpose`, because FORTRAN), which is not actually the common case: `nn.Linear` uses a column-major layout for its weight, so `TN` is much more common in practice.

Reviewed By: adamomainz

Differential Revision: D63714661

fbshipit-source-id: 735c25c59ddeb6596afd9b19f463af92036a830b
1 parent d512e67 commit 4445aa2
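
A note on the naming (not part of the commit message): in BLAS convention, `T`/`N` indicate whether an operand is transposed, and since FORTRAN BLAS assumes column-major storage, a row-major matrix reads as transposed — hence `TT` for row-major x row-major. `nn.Linear` stores its weight as `(N, K)` and applies it as `x @ weight.T`, which makes the weight operand effectively column-major. A minimal sketch of why that yields `TN`, assuming standard PyTorch semantics:

```python
# Sketch (not from the commit): why nn.Linear gemms are "TN".
# The activation is row-major ("T" through column-major BLAS eyes),
# while the weight is consumed as weight.T, a column-major ("N") view.
import torch

x = torch.randn(8, 16)            # (M, K), row-major
linear = torch.nn.Linear(16, 32)  # weight stored as (N, K) = (32, 16)
w_t = linear.weight.T             # (K, N) view with column-major strides

print(x.stride())    # (16, 1) -> rows are contiguous
print(w_t.stride())  # (1, 16) -> columns are contiguous
print(torch.allclose(linear(x), x @ w_t + linear.bias))  # True
```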

File tree

3 files changed (+37, -37 lines)


torchbenchmark/operators/gemm/data_io.py

Lines changed: 0 additions & 34 deletions
This file was deleted.

torchbenchmark/operators/gemm/operator.py

Lines changed: 37 additions & 1 deletion
```diff
@@ -9,6 +9,8 @@
 import torch._inductor.config as inductor_config
 import triton
 
+from torchbenchmark import REPO_PATH
+
 from torchbenchmark.util.triton_op import (
     BenchmarkOperator,
     BenchmarkOperatorMetrics,
@@ -19,7 +21,6 @@
     register_x_val,
 )
 
-from .data_io import parse_args, read_shapes_from_csv
 from .kernels import matmul as kernels
 from .partition_k import matmul_partition_k
 from .persistent_matmul import (
@@ -88,6 +89,35 @@
 ]
 
 
+def parse_args(args: List[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="TorchBench Gemm operator Benchmark")
+    parser.add_argument("--m", type=int)
+    parser.add_argument("--k", type=int)
+    parser.add_argument("--n", type=int)
+    parser.add_argument("--bias", type=int)
+    parser.add_argument("--input", type=str)
+    parser.add_argument("--splitk", action="store_true", default=False)
+    parser.add_argument("--llama", action="store_true", default=False)
+    parser.add_argument("--layout", type=str, default="tn")
+    args = parser.parse_args(args)
+    return args
+
+
+def read_shapes_from_csv(csv_path: str) -> List[List[int]]:
+    input_file_path = os.path.join(
+        REPO_PATH, "torchbenchmark", "operators", "gemm", csv_path
+    )
+    shapes = []
+    with open(input_file_path, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            shape = [
+                int(row.get(f)) if row.get(f) else None for f in ("M", "N", "K", "Bias")
+            ]
+            shapes.append(shape)
+    return shapes
+
+
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["speedup", "tflops"]
     DEFAULT_PRECISION = "fp16"
@@ -98,6 +128,7 @@ def __init__(
         super().__init__(tb_args, extra_args)
         self.use_cuda_graphs = False
         gemm_args = parse_args(self.extra_args)
+        self.layout = gemm_args.layout
         if gemm_args.input:
             self.shapes = read_shapes_from_csv(gemm_args.input)
         elif gemm_args.splitk:
@@ -261,6 +292,11 @@ def get_input_iter(self) -> Generator:
             w = self._scaled_randn(
                 (k, n), scale=k, device=self.device, dtype=self.dtype
             )
+            # Convert inputs to column-major if layout is "n" (non-transposed)
+            if self.layout[0] == "n":
+                a = a.T.contiguous().T
+            if self.layout[1] == "n":
+                w = w.T.contiguous().T
             if not bias == None:
                 bias = torch.randn(
                     (bias), device=self.device, dtype=self.dtype
```
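
The `a.T.contiguous().T` idiom in the hunk above is how the benchmark materializes a column-major tensor: the first `.T` swaps the strides, `.contiguous()` copies the data in that transposed order, and the trailing `.T` restores the original shape while keeping the swapped strides. An illustrative sketch (not from the commit):

```python
# Sketch of the column-major conversion used in get_input_iter above.
import torch

a = torch.randn(4, 6)        # row-major: shape (4, 6), strides (6, 1)
a_col = a.T.contiguous().T   # same shape and values, strides (1, 4)

assert torch.equal(a, a_col)                     # values unchanged
print(a.stride(), a_col.stride())                # (6, 1) (1, 4)
print(a.is_contiguous(), a_col.is_contiguous())  # True False
```

With the default `--layout tn`, only `w` is converted, mirroring the `nn.Linear` case from the summary; `--layout nn` would convert both operands.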

torchbenchmark/operators/gemm/triton_matmul.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -201,8 +201,6 @@ def leaky_relu(x):
 def matmul(a, b, activation=""):
     # Check constraints.
     assert a.shape[1] == b.shape[0], "Incompatible dimensions"
-    assert a.is_contiguous(), "Matrix A must be contiguous"
-    assert b.is_contiguous(), "Matrix B must be contiguous"
     M, K = a.shape
     K, N = b.shape
     # Allocates output.
```
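
These two asserts had to go because the new `n` layouts intentionally produce non-contiguous operands, and the kernel does not need contiguity anyway: the tutorial-style Triton matmul computes pointers from explicit stride arguments. An illustrative check (hypothetical shapes, not from the commit):

```python
# The deleted asserts would now reject valid column-major inputs.
import torch

b = torch.randn(64, 32)
b_col = b.T.contiguous().T     # column-major copy, as built in operator.py
print(b_col.is_contiguous())   # False -> "Matrix B must be contiguous" would fire
# The kernel is launched with b.stride(0) and b.stride(1) and indexes
# through them, so column-major operands work without an extra copy.
```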
