ananthsub
diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp
Lines changed: 12 additions & 6 deletions
diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h
Lines changed: 6 additions & 18 deletions
diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
Lines changed: 30 additions & 0 deletions
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
Lines changed: 14 additions & 9 deletions
@@ -90,17 +90,17 @@ std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
 
 void TensorDescriptor::print() { std::cout << *this; }
 
-void FilterDescriptor::set(const at::Tensor &t, int64_t pad) {
+void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) {
   auto dim = t.ndimension();
   if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
 #define _STR(X) #X
 #define STR(X) _STR(X)
     throw std::runtime_error("MIOpen supports only up to " STR(MIOPEN_DIM_MAX) " dimensions");
 #undef _STR
 #undef STR
-  if (!t.is_contiguous()) {
-    throw std::runtime_error("MIOpen filters (a.k.a. weights) must be contiguous");
-  }
+  TORCH_CHECK(t.is_contiguous(memory_format),
+      "MIOpen filters (a.k.a. weights) must be contiguous");
+
   int size[MIOPEN_DIM_MAX];
   int stride[MIOPEN_DIM_MAX];
   for (int i = 0; i < dim; ++i) {
@@ -109,9 +109,15 @@ void FilterDescriptor::set(const at::Tensor &t, int64_t pad) {
   for (int i = dim; i < pad; ++i) {
     size[i] = (int) 1;
   }
-  for (int i = dim - 1; i >=0; --i) {
-    stride[i] = (i == dim - 1) ? 1 : stride[i+1] * size[i+1];
+
+  for (int i = dim; i < pad; ++i) {  // padded dims only; i stays below pad, so never past the arrays' end
+      stride[i] = 1;
   }
+  for (int i = dim-1 ; i >=0; --i ) {
+      // Use the tensor's own strides so the memory_format layout checked above is what MIOpen sees
+      stride[i] = t.stride(i);
+  }
+
   dim = std::max(dim, pad);
   set(getDataType(t), (int) dim, size, stride);
 }
 
@@ -18,20 +18,6 @@ inline int dataSize(miopenDataType_t dataType)
   }
 }
 
-// This function modifies 'stride' in place so that the stride for
-// dim i is the product of the sizes of dims i+1 to the end.
-static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) {
-  int64_t z = 1;
-  for(int d = dim-1; d >= 0; d--)
-  {
-    if (size[d] == 1) {
-      stride[d] = z;
-    } else {
-      z *= size[d];
-    }
-  }
-}
-
 template <typename T, miopenStatus_t (*dtor)(T*)>
 struct DescriptorDeleter {
   void operator()(T* x) {
@@ -96,7 +82,6 @@ class TensorDescriptor
 
 private:
   void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
-    fixSizeOneDimStride(dim, size, stride);
     MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
   }
 };
@@ -108,12 +93,15 @@ class FilterDescriptor
                       &miopenCreateTensorDescriptor,
                       &miopenDestroyTensorDescriptor>
 {
-public:
-  void set(const at::Tensor &t, int64_t pad = 0);
+ public:
+  void set(const at::Tensor &t, int64_t pad = 0) {
+    set(t, at::MemoryFormat::Contiguous, pad);
+  }
+
+  void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);
 
 private:
   void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
-    fixSizeOneDimStride(dim, size, stride);
     MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
   }
 };
 
@@ -1,5 +1,6 @@
 #pragma once
 #include <ATen/detail/CUDAHooksInterface.h>
+#include <c10/util/env.h>
 
 namespace at { namespace native {
 
@@ -106,4 +107,33 @@ static inline bool cudnn_conv_use_channels_last(const at::Tensor& input, const a
   return can_use_cudnn_channels_last_2d || can_use_cudnn_channels_last_3d;
 }
 
+static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+  // Decides whether a MIOpen convolution should use channels-last (NHWC); false on builds without MIOpen.
+  if (!at::detail::getCUDAHooks().compiledWithMIOpen()) {
+    return false;
+  }
+  // NHWC is disabled whenever either operand is float64.
+  if (input.scalar_type() == at::kDouble || weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  bool use_nhwc_2d = false;
+#if defined(USE_ROCM) && (ROCM_VERSION >= 40300)
+  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+  // See #64427
+  static c10::optional<bool> nhwc_env_flag = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
+  // Opt-in: the flag must be present and true.
+  if (nhwc_env_flag && *nhwc_env_flag) {
+    // One operand suggesting channels-last is sufficient.
+    use_nhwc_2d = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast ||
+        weight.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
+  }
+#endif
+
+  // No channels-last 3d path is enabled in this helper.
+  const bool use_nhwc_3d = false;
+
+  return use_nhwc_2d || use_nhwc_3d;
+}
+
 }} // namespace at::native
@@ -838,9 +838,14 @@ at::Tensor _convolution(
     weight = view4d(weight);
   }
 
-  at::MemoryFormat cudnn_memory_format = at::MemoryFormat::Contiguous;
-  if (cudnn_conv_use_channels_last(input, weight)) {
-    cudnn_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
+  at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous;
+
+  if (detail::getCUDAHooks().compiledWithCuDNN() && cudnn_conv_use_channels_last(input, weight)) {
+    backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
+  }
+
+  if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
+    backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast;
   }
 
   Tensor output;
@@ -853,15 +858,15 @@ at::Tensor _convolution(
       auto dilation = params.dilation;
       if (params.use_cudnn_depthwise(input, weight)) {
         output = at::cudnn_convolution(
-            input.contiguous(cudnn_memory_format), weight,
+            input.contiguous(backend_memory_format), weight,
             padding, stride, dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
         if (bias.defined()) {
           output.add_(reshape_bias(input.dim(), bias));
         }
 
       } else if (params.use_miopen(input, weight, bias.defined())){
         output = at::miopen_depthwise_convolution(
-            input.contiguous(), weight, bias,
+            input.contiguous(backend_memory_format), weight, bias,
             padding, stride, dilation, params.groups, params.benchmark, params.deterministic);
       } else {
           if (input.ndimension() == 4) {
@@ -882,14 +887,14 @@ at::Tensor _convolution(
 
     if (params.transposed) {
       output = at::cudnn_convolution_transpose(
-          input.contiguous(cudnn_memory_format), weight,
+          input.contiguous(backend_memory_format), weight,
           params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
       if (bias.defined()) {
         output.add_(reshape_bias(input.dim(), bias));
       }
     } else {
       output = at::cudnn_convolution(
-          input.contiguous(cudnn_memory_format), weight,
+          input.contiguous(backend_memory_format), weight,
           params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
       if (bias.defined()) {
         output.add_(reshape_bias(input.dim(), bias));
@@ -905,11 +910,11 @@ at::Tensor _convolution(
 
     if (params.transposed) {
       output = at::miopen_convolution_transpose(
-          input.contiguous(), weight, bias,
+          input.contiguous(backend_memory_format), weight, bias,
           params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
     } else {
       output = at::miopen_convolution(
-          input.contiguous(), weight, bias,
+          input.contiguous(backend_memory_format), weight, bias,
           params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
     }
   } else if (params.use_mkldnn(input, weight)) {