Commit d9e267b

Commit message:
Update
[ghstack-poisoned]

2 parents: 967ea76 + 6922733

File tree: 49 files changed, +5376 / -1613 lines


.github/workflows/float8nocompile_test.yaml

Lines changed: 0 additions & 53 deletions
This file was deleted.

.github/workflows/torchao_experimental_test.yml

Lines changed: 4 additions & 9 deletions
@@ -37,10 +37,8 @@ jobs:
           # of torch and torchao, which we do not want to use
           pip install executorch
           pip install torch==2.7.0.dev20250311 --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
-          pip install numpy
-          pip install pytest
-          pip install parameterized
-          USE_CPP=1 TOCHAO_BUILD_KLEIDIAI=1 pip install .
+          pip install -r dev-requirements.txt
+          USE_CPP=1 TORCHAO_BUILD_KLEIDIAI=1 pip install .
       - name: Run python tests
         run: |
           conda activate venv
@@ -99,11 +97,8 @@ jobs:
           python -c "import torch; print(torch.__version__)"
       - name: Install requirements
         run: |
-          pip install cmake
-          pip install parameterized
-          pip install pyyaml
-          pip install numpy
-          pip install importlib-metadata
+          pip install -r dev-requirements.txt
+          pip install pyyaml importlib-metadata
       - name: Print pip freeze
         run: |
           pip freeze

dev-requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@ importlib_metadata
 # Custom CUDA Extensions
 ninja

+# CPU kernels
+cmake<4.0.0,>=3.19.0
+
 # Linting
 ruff==0.6.8
 pre-commit

scripts/clean_release_notes.py

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ def format_commit(commit_line: str) -> str:
     After: * Commit title (https://github.com/pytorch/ao/pull/123)
     """
     # Remove author, put PR link in parentheses
-    commit_line = re.sub(" by @.* in (.*)", r" (\\g<1>)", commit_line)
+    commit_line = re.sub(" by @.* in (.*)", r" (\g<1>)", commit_line)
     # Capitalize first letter
     commit_line = commit_line.lstrip("* ")
     commit_line = "* " + commit_line[0].upper() + commit_line[1:]
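Why this one-character change matters: in a raw string, "\\g<1>" is a literal backslash followed by the text "g<1>", so re.sub copies it into the output verbatim instead of substituting capture group 1; r" (\g<1>)" is the actual group reference. A minimal sketch of the difference (the sample commit line below is hypothetical):

import re

# Hypothetical release-notes line in the format format_commit() processes
line = "* Fix a bug by @someuser in https://github.com/pytorch/ao/pull/123"

# Old replacement: r" (\\g<1>)" contains a literal backslash, so the PR link is dropped
print(re.sub(" by @.* in (.*)", r" (\\g<1>)", line))
# -> * Fix a bug (\g<1>)

# New replacement: r" (\g<1>)" refers to capture group 1, keeping the PR link
print(re.sub(" by @.* in (.*)", r" (\g<1>)", line))
# -> * Fix a bug (https://github.com/pytorch/ao/pull/123)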

test/quantization/test_galore_quant.py

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@


 @pytest.mark.skip("skipping for now, see comments below")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 @pytest.mark.parametrize(
     "dim1,dim2,dtype,signed,blocksize",
     TEST_CONFIGS,
@@ -89,6 +90,7 @@ def test_galore_quantize_blockwise(dim1, dim2, dtype, signed, blocksize):
     TEST_CONFIGS,
 )
 @skip_if_rocm("ROCm enablement in progress")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 def test_galore_dequant_blockwise(dim1, dim2, dtype, signed, blocksize):
     g = torch.randn(dim1, dim2, device="cuda", dtype=dtype) * 0.01

test/quantization/test_qat.py

Lines changed: 71 additions & 0 deletions
@@ -133,6 +133,18 @@ def forward(self, x):
         return x


+class M4(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(512, 256, bias=False).to(torch.float)
+
+    def example_inputs(self):
+        return (torch.randn(1, 512).to(torch.float),)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
 class ModelWithLinearBias(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1389,6 +1401,65 @@ def test_qat_linear_bias(self):
         example_inputs = m.example_inputs()
         m(*example_inputs)

+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_fake_quantize_per_token_vs_convert(self):
+        """
+        Test that the following produce the exact same numerics:
+          1. FakeQuantizer with asymmetric per_token config
+          2. torchao.quantization.utils.per_token_dynamic_quant
+        """
+        from torchao.quantization.utils import per_token_dynamic_quant
+
+        torch.manual_seed(self.SEED)
+        x = torch.randn(1, 235, 2048)
+        config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
+        fake_quantizer = FakeQuantizer(config)
+        fake_quantizer_out = fake_quantizer(x)
+        baseline_out = per_token_dynamic_quant(x)
+        torch.testing.assert_close(fake_quantizer_out, baseline_out, atol=0, rtol=0)
+
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_qat_8da4w_prepare_vs_convert(self):
+        """
+        Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
+        numerics that match exactly over N trials.
+        """
+        from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
+        from torchao.quantization.utils import compute_error
+
+        num_trials = 1000
+        group_size = 16
+        non_inf_sqnr = []
+
+        for seed in range(self.SEED, self.SEED + num_trials):
+            torch.manual_seed(seed)
+            m = M4()
+            torch.manual_seed(seed)
+            x = m.example_inputs()
+
+            quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
+            prepared = quantizer.prepare(m)
+            prepared_out = prepared(*x)
+            converted = quantizer.convert(prepared)
+            converted_out = converted(*x)
+            sqnr = compute_error(prepared_out, converted_out).item()
+            if sqnr != float("inf"):
+                non_inf_sqnr.append(sqnr)
+
+        avg_sqnr = (
+            sum(non_inf_sqnr) / len(non_inf_sqnr) if len(non_inf_sqnr) > 0 else -1
+        )
+        fail_message = "%s/%s trials did not match exactly, average sqnr = %s" % (
+            len(non_inf_sqnr),
+            num_trials,
+            avg_sqnr,
+        )
+        self.assertEqual(len(non_inf_sqnr), 0, fail_message)
+

 if __name__ == "__main__":
     unittest.main()
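For context on the second new test: compute_error reports an SQNR (signal-to-quantization-noise ratio), and an infinite value means the prepared and converted outputs match bit-exactly, which is what the test asserts across all trials. A minimal sketch of the usual dB definition, for illustration only (not torchao's exact implementation):

import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    # 20 * log10(||signal|| / ||noise||); a zero-norm error yields inf,
    # i.e. the two tensors match exactly
    noise = reference - candidate
    return (20 * torch.log10(reference.norm() / noise.norm())).item()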

torchao/_executorch_ops.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 # LICENSE file in the root directory of this source tree.
 import torch

+# TODO: delete these ops
+

 def _quantized_decomposed_quantize_per_channel_group_wrapper(*args, **kwargs):
     """

torchao/csrc/cuda/fp6_llm/fp6_linear.cu

Lines changed: 38 additions & 14 deletions
@@ -21,6 +21,7 @@
 //
 // MODIFICATION NOTE (2024-09-25): added SM75 support (https://github.com/pytorch/ao/pull/942):
 // - Modified the TilingConfig parameters for SM75 to deal with smaller shared memory
+// - Added proper architecture check at both host and device level
 //


@@ -98,7 +99,24 @@ void fpx_linear_kernel(cudaStream_t stream,
     static_assert(std::is_same<InputDataType, half>::value || std::is_same<InputDataType, __nv_bfloat16>::value, "Type must be 'half' or '__nv_bfloat16'");
     assert(M_Global % 256 == 0);
     assert(K_Global % 64 == 0);
-    assert(N_Global>0);
+    assert(N_Global > 0);
+
+    // Check GPU Compute Capability before proceeding
+    int device, major, minor;
+    CHECK_CUDA(cudaGetDevice(&device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
+
+    // Early exit with error for unsupported architectures
+    if ((major < 7) || (major == 7 && minor < 5)) {
+        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
+                           "Your current device has SM", major, minor, " which is not supported.");
+    }
+
+    const bool is_sm75_gpu = (major == 7) && (minor == 5);
+    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value) {
+        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
+    }

     // Work around to support more N shapes:
     size_t N_PowerOf2;
@@ -109,17 +127,6 @@ void fpx_linear_kernel(cudaStream_t stream,
     if(N_Global>64 && N_Global<=128) N_PowerOf2 = 128;
     if(N_Global>128) N_PowerOf2 = ((N_Global-1)/128+1) * 128;

-    // Check GPU Compute Capability
-    int device, major, minor;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-    const bool is_sm75_gpu = (major == 7) && (minor == 5);
-    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value)
-        TORCH_CHECK(false, "Bfloat16 inputs are not supported for SM75");
-    if ((major < 7) || (major == 7 && minor < 5))
-        TORCH_CHECK(false, "FP6LLM_API Error: FP6LLM requires GPU with SM75 or higher!\n");
-
     if (is_sm75_gpu && (N_PowerOf2 == 64 || N_PowerOf2 == 128 || N_PowerOf2 % 128 == 0)) {
         // For SM75 and N >= 64, we use a different TilingConfig to deal with smaller shared memory.
         if (Split_K == 1) {
@@ -136,7 +143,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64:  Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             default:  if (N_PowerOf2 % 128 != 0) {
-                          TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
+                          TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
                      }
                      Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -149,7 +156,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64:  Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             default:  if (N_PowerOf2 % 128 != 0) {
-                          TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
+                          TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
                      }
                      Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -210,6 +217,23 @@ torch::Tensor fp_eXmY_linear_forward_cuda(
     torch::Tensor _scales,
     int64_t splitK=1)
 {
+    // Check GPU Compute Capability before proceeding
+    int device, major, minor;
+    CHECK_CUDA(cudaGetDevice(&device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
+
+    // Early exit with error for unsupported architectures
+    if ((major < 7) || (major == 7 && minor < 5)) {
+        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
+                           "Your current device has SM", major, minor, " which is not supported.");
+    }
+
+    const bool is_sm75_gpu = (major == 7) && (minor == 5);
+    if (is_sm75_gpu && _in_feats.scalar_type() == at::ScalarType::BFloat16) {
+        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
+    }
+
     const int64_t NBITS = 1 + EXPONENT + MANTISSA;
     int num_in_feats = _in_feats.size(0);
     int num_in_channels = _in_feats.size(1);
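Because the capability check now raises from the host-side entry point, callers that might run on pre-Turing GPUs can probe the device first. A hedged sketch using PyTorch's public API (the helper name is ours, not part of torchao):

import torch

def device_supports_quant_llm(device: int = 0) -> bool:
    # The Quant-LLM kernel requires SM75 (Turing) or newer;
    # on SM75 itself, bfloat16 inputs are additionally rejected
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(device)
    return (major, minor) >= (7, 5)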

torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh

Lines changed: 14 additions & 5 deletions
@@ -51,17 +51,14 @@
  * B: col major, FP16
  * C: col major, FP16
  */
-template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
 __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
                                   const half *B,
                                   OutputDataType* C,
                                   const size_t M_Global, const size_t N_Global, const size_t K_Global,
                                   int Split_K)
 {
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
-    static_assert(false, "Quant-LLM kernel: At least Turing generation (sm75) is required.");
-    // __trap(); // fails at runtime instead of compile time
-  #endif
   #ifdef DEBUG_MODE
     assert(K_Global%TilingConfig::TILE_K==0);
     assert(M_Global%TilingConfig::TILE_M==0);
@@ -233,3 +230,15 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
         }
     }
 }
+#else
+// Stub implementation for older architectures
+template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
+__global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
+                                  const half *B,
+                                  OutputDataType* C,
+                                  const size_t M_Global, const size_t N_Global, const size_t K_Global,
+                                  int Split_K)
+{
+    // NOOP, should never actually be called
+}
+#endif

torchao/experimental/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ include_directories(${TORCHAO_INCLUDE_DIRS})
 if(TORCHAO_BUILD_CPU_AARCH64)
     message(STATUS "Building with cpu/aarch64")
     add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)
+    add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)

     # Defines torchao_kernels_aarch64
     add_subdirectory(kernels/cpu/aarch64)
