NVIDIA
diff --git a/‎docs/sphinx/api/languages/python_api.rst
+1 b/‎docs/sphinx/api/languages/python_api.rst
+1
diff --git a/‎include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
+7 b/‎include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
+7
diff --git a/‎include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+53 b/‎include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+53
diff --git a/‎include/cudaq/Optimizer/Transforms/Passes.td
+10-1 b/‎include/cudaq/Optimizer/Transforms/Passes.td
+10-1
diff --git a/‎lib/Frontend/nvqpp/ConvertExpr.cpp
+56-4 b/‎lib/Frontend/nvqpp/ConvertExpr.cpp
+56-4
diff --git a/‎lib/Optimizer/Builder/Intrinsics.cpp
+1 b/‎lib/Optimizer/Builder/Intrinsics.cpp
+1
diff --git a/‎lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp
+159-10 b/‎lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp
+159-10
@@ -218,6 +218,7 @@ Noisy Simulation
 
 .. autoclass:: cudaq::NoiseModel
     :members:
+    :exclude-members: register_channel
     :special-members: __init__
 
 .. autoclass:: cudaq::BitFlipChannel
 
@@ -94,4 +94,11 @@ static constexpr const char QIRRecordOutput[] =
 static constexpr const char QIRClearResultMaps[] =
     "__quantum__rt__clear_result_maps";
 
+/// Element type (`float` or `double`) of the channel data handed to the
+/// `QISApplyKrausChannel` call; codegen passes it as an integer code.
+enum class KrausChannelDataKind { FloatKind, DoubleKind };
+
+static constexpr const char QISApplyKrausChannel[] =
+    "__quantum__qis__apply_kraus_channel_generalized";
+
 } // namespace cudaq::opt
@@ -477,6 +477,59 @@ def quake_ComputeActionOp : QuakeOp<"compute_action"> {
   }];
 }
 
+// Applies a Kraus channel ("noise") to a set of qubits. The channel is named
+// either by the `noise_func` symbol attribute or by the `key` operand; both
+// are declared optional below.
+def quake_ApplyNoiseOp : QuakeOp<"apply_noise", [AttrSizedOperandSegments]> {
+  let summary = "Apply a noise operation to qubits.";
+  let description = [{
+    This operation provides support for the `cudaq::apply_noise` template
+    function. This function is only valid in simulation contexts where the
+    simulator is part of the same process as the C++ host executable itself.
+
+    A noise operator is the application of a Kraus channel to a selected set
+    of qubits. This is a point-wise annotation approach that a user might
+    deploy to introduce "noise" to their circuit under simulation. It is unlike
+    a general (unitary) gate application in that there is no notion of controls
+    or an adjoint.
+  }];
+
+  // NOTE(review): presumably exactly one of `noise_func` and `key` must be
+  // present; confirm against the op's verifier, which is not defined here.
+  // `parameters` and `qubits` are both variadic, hence the
+  // AttrSizedOperandSegments trait on the op.
+  let arguments = (ins
+    OptionalAttr<FlatSymbolRefAttr>:$noise_func,
+    Optional<AnySignlessInteger>:$key,
+    Variadic<AnyType>:$parameters,
+    Variadic<NonStruqRefType>:$qubits
+  );
+
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+
+  // Convenience builders. The first two set `noise_func` (from a name or from
+  // an existing symbol attribute) and leave `key` empty; the third sets `key`
+  // and passes a null `noise_func`. All of them build an op with no results.
+  let builders = [
+    OpBuilder<(ins "mlir::StringRef":$noise_func,
+                   "mlir::ValueRange":$parameters,
+                   "mlir::ValueRange":$targets), [{
+      return build($_builder, $_state, mlir::TypeRange{},
+        mlir::FlatSymbolRefAttr::get($_builder.getContext(), noise_func), {},
+        parameters, targets);
+    }]>,
+    OpBuilder<(ins "mlir::FlatSymbolRefAttr":$noise_func,
+                   "mlir::ValueRange":$parameters,
+                   "mlir::ValueRange":$targets), [{
+      return build($_builder, $_state, mlir::TypeRange{}, noise_func, {},
+        parameters, targets);
+    }]>,
+    OpBuilder<(ins "mlir::Value":$key,
+                   "mlir::ValueRange":$parameters,
+                   "mlir::ValueRange":$targets), [{
+      return build($_builder, $_state, mlir::TypeRange{},
+        mlir::FlatSymbolRefAttr{}, key, parameters, targets);
+    }]>
+  ];
+
+  let extraClassDeclaration = [{
+    // Spelling of the attribute that holds the noise function symbol.
+    static constexpr mlir::StringRef getNoiseFuncAttrNameStr() {
+      return "noise_func";
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Memory and register conversion instructions: These operations are useful for
 // intermediate conversions between memory-SSA and value-SSA semantics and vice
 
@@ -310,7 +310,16 @@ def DependencyAnalysis : Pass<"dep-analysis", "mlir::ModuleOp"> {
   ];
 }
 
-def EraseNopCalls : Pass<"erase-nop-calls", "mlir::func::FuncOp"> {
+// Declares the `erase-noise` pass. No anchor operation type is given in the
+// `Pass<...>` declaration.
+def EraseNoise : Pass<"erase-noise"> {
+  let summary = "Erase the injection of noise via Kraus channels.";
+  let description = [{
+    Although CUDA-Q allows the user to specify the application of noise via
+    Kraus channels, these are not needed and must be removed if the code is to
+    run on quantum hardware, for example.
+  }];
+}
+
+def EraseNopCalls : Pass<"erase-nop-calls"> {
   let summary = "Erase calls to any builtin intrinsics that are NOPs.";
   let description = [{
     The code may contain marker function calls that do not generate any actual
 
@@ -1503,6 +1503,60 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       return true;
     }
 
+    if (funcName == "apply_noise") {
+      // Split the call arguments into a leading run of classical channel
+      // parameters followed by the qubit targets.
+      SmallVector<Value> params;
+      SmallVector<Value> qubits;
+      bool scanningParams = true;
+      for (auto indexedArg : llvm::enumerate(args)) {
+        auto arg = indexedArg.value();
+        Type argTy = arg.getType();
+        if (scanningParams) {
+          // A pointer to a floating-point value is one more parameter.
+          auto fpPtrTy = dyn_cast<cudaq::cc::PointerType>(argTy);
+          if (fpPtrTy && isa<FloatType>(fpPtrTy.getElementType())) {
+            params.push_back(arg);
+            continue;
+          }
+          // Any other argument ends the parameter run. The one exception is a
+          // vector of f64 in the very first position, which is accepted as
+          // the sole parameter.
+          scanningParams = false;
+          auto vecTy = dyn_cast<cudaq::cc::StdvecType>(argTy);
+          if (vecTy && indexedArg.index() == 0 &&
+              vecTy.getElementType() == builder.getF64Type()) {
+            params.push_back(arg);
+            continue;
+          }
+        }
+        // Past the parameters, every argument has to be a qubit (a reference
+        // or a vector of qubits). Interleaving floating-point and qubit
+        // arguments is rejected here.
+        if (!isa<quake::RefType, quake::VeqType>(argTy)) {
+          reportClangError(x, mangler,
+                           "apply_noise argument types not supported.");
+          return false;
+        }
+        qubits.push_back(arg);
+      }
+
+      // The callee must be available as a function constant in order to name
+      // the noise function on the new operation.
+      auto callee = calleeOp.getDefiningOp<func::ConstantOp>();
+      if (!callee) {
+        reportClangError(x, mangler,
+                         "apply_noise with a vector argument is deprecated.");
+        return false;
+      }
+
+      StringRef calleeName = callee.getValue();
+      builder.create<quake::ApplyNoiseOp>(loc, calleeName, params, qubits);
+
+      // Make sure the module has a declaration of the noise function. Its
+      // signature is the parameter types followed by the qubit types, with no
+      // results.
+      SmallVector<Type> argTys;
+      for (Value param : params)
+        argTys.push_back(param.getType());
+      for (Value qubit : qubits)
+        argTys.push_back(qubit.getType());
+      auto calleeTy = FunctionType::get(builder.getContext(), argTys, {});
+      cudaq::opt::factory::getOrAddFunc(loc, calleeName, calleeTy, module);
+      return true;
+    }
+
     if (funcName.equals("mx") || funcName.equals("my") ||
         funcName.equals("mz")) {
       // Measurements always return a bool or a std::vector<bool>.
@@ -1807,8 +1861,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
                                        kernelArgs);
         return inlinedFinishControlNegations();
       }
-      if (auto func =
-              dyn_cast_or_null<func::ConstantOp>(calleeValue.getDefiningOp())) {
+      if (auto func = calleeValue.getDefiningOp<func::ConstantOp>()) {
         auto funcTy = cast<FunctionType>(func.getType());
         auto callableSym = func.getValueAttr();
         inlinedStartControlNegations();
@@ -1920,8 +1973,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
                                               /*isAdjoint=*/true, ValueRange{},
                                               kernArgs);
       }
-      if (auto func =
-              dyn_cast_or_null<func::ConstantOp>(kernelValue.getDefiningOp())) {
+      if (auto func = kernelValue.getDefiningOp<func::ConstantOp>()) {
         auto kernSym = func.getValueAttr();
         auto funcTy = cast<FunctionType>(func.getType());
         auto kernArgs =
 
@@ -435,6 +435,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
   func.func private @__quantum__qis__custom_unitary__adj(!cc.ptr<complex<f64>>, !qir_array, !qir_array, !qir_charptr)
 
   llvm.func @generalizedInvokeWithRotationsControlsTargets(i64, i64, i64, i64, !qir_llvmptr, ...) attributes {sym_visibility = "private"}
+  llvm.func @__quantum__qis__apply_kraus_channel_generalized(i64, i64, i64, i64, i64, ...) attributes {sym_visibility = "private"}
 )#"},
 
     // Declarations for base and adaptive profile QIR functions used by codegen.
 
@@ -281,6 +281,155 @@ struct AllocaOpToIntRewrite : public OpConversionPattern<quake::AllocaOp> {
   }
 };
 
+/// Lowers `quake.apply_noise` to a call. The key-based form becomes a variadic
+/// call to the generalized Kraus channel runtime function; the `noise_func`
+/// form becomes a direct call to the named noise function.
+struct ApplyNoiseOpRewrite : public OpConversionPattern<quake::ApplyNoiseOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(quake::ApplyNoiseOp noise, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto params = adaptor.getParameters();
+    // True when the parameters arrive as one span (a single stdvec value)
+    // rather than as individual values.
+    const bool paramsAreSpan =
+        params.size() == 1 && isa<cudaq::cc::StdvecType>(params[0].getType());
+    if (noise.getNoiseFunc())
+      return lowerNoiseFuncVariant(noise, adaptor, rewriter, paramsAreSpan);
+    return lowerKeyVariant(noise, adaptor, rewriter, paramsAreSpan);
+  }
+
+private:
+  /// Key-based variant. Call the generalized version of the
+  /// apply_kraus_channel helper function and let it do all the conversions
+  /// into contiguous buffers, which keeps the codegen here simple.
+  ///
+  /// The emitted argument list is: element-kind code, key, span flag, number
+  /// of individual parameters, number of targets, the parameter data, and
+  /// finally the qubits.
+  LogicalResult lowerKeyVariant(quake::ApplyNoiseOp noise, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter,
+                                bool paramsAreSpan) const {
+    auto loc = noise.getLoc();
+    auto params = adaptor.getParameters();
+    auto qubits = adaptor.getQubits();
+    auto i64Const = [&](std::int64_t value) -> Value {
+      return rewriter.create<arith::ConstantIntOp>(loc, value, 64);
+    };
+
+    // With no parameters at all, the data kind defaults to double.
+    bool usingDouble = true;
+    if (!params.empty()) {
+      Type param0Ty = params[0].getType();
+      Type eleTy =
+          paramsAreSpan
+              ? cast<cudaq::cc::StdvecType>(param0Ty).getElementType()
+              : cast<cudaq::cc::PointerType>(param0Ty).getElementType();
+      usingDouble = eleTy == rewriter.getF64Type();
+    }
+    const auto dataKind = usingDouble
+                              ? cudaq::opt::KrausChannelDataKind::DoubleKind
+                              : cudaq::opt::KrausChannelDataKind::FloatKind;
+
+    SmallVector<Value> args;
+    args.push_back(i64Const(static_cast<std::int64_t>(dataKind)));
+    args.push_back(adaptor.getKey());
+    // A span is flagged with 1 and reports zero individual parameters;
+    // otherwise the flag is 0 and the parameter count is passed.
+    args.push_back(i64Const(paramsAreSpan ? 1 : 0));
+    args.push_back(i64Const(
+        paramsAreSpan ? 0 : static_cast<std::int64_t>(params.size())));
+    args.push_back(i64Const(static_cast<std::int64_t>(qubits.size())));
+
+    if (paramsAreSpan) {
+      // Pass the span as a (data pointer, length) pair.
+      Value span = params[0];
+      auto spanTy = cast<cudaq::cc::StdvecType>(span.getType());
+      auto dataTy = cudaq::cc::PointerType::get(
+          cudaq::cc::ArrayType::get(spanTy.getElementType()));
+      args.push_back(
+          rewriter.create<cudaq::cc::StdvecDataOp>(loc, dataTy, span));
+      args.push_back(rewriter.create<cudaq::cc::StdvecSizeOp>(
+          loc, rewriter.getI64Type(), span));
+    } else {
+      args.append(params.begin(), params.end());
+    }
+    args.append(qubits.begin(), qubits.end());
+
+    rewriter.replaceOpWithNewOp<cudaq::cc::VarargCallOp>(
+        noise, TypeRange{}, cudaq::opt::QISApplyKrausChannel, args);
+    return success();
+  }
+
+  /// `noise_func` variant: call the noise function itself. Two cases must be
+  /// considered.
+  ///
+  /// 1. The parameters to the Kraus channel are passed in an object of type
+  /// `std::vector<double>`. The device-side span of doubles has to be
+  /// translated into a `std::vector<double>` on the stack for passing to the
+  /// host-side function. It is ABSOLUTELY CRITICAL that the host side NOT use
+  /// move semantics or otherwise try to claim ownership of the fake vector
+  /// being passed back, as that will crash the executable. The host side
+  /// should not modify the content of the vector either. These assumptions
+  /// are made here because the argument to the host side is
+  /// `const std::vector<double>&`. The signature of the called function must
+  /// also be modified, since the bridge will have assumed it was a span.
+  /// Calling the function with the wrong data type, or having the callee
+  /// modify the vector, will result in the executable CRASHING or giving
+  /// WRONG ANSWERS.
+  ///
+  /// 2. The far simpler case: rvalue references to double values are passed
+  /// individually to the host-side function. The operands already have that
+  /// form, so they are appended unchanged.
+  LogicalResult lowerNoiseFuncVariant(quake::ApplyNoiseOp noise,
+                                      OpAdaptor adaptor,
+                                      ConversionPatternRewriter &rewriter,
+                                      bool paramsAreSpan) const {
+    auto loc = noise.getLoc();
+    auto params = adaptor.getParameters();
+    auto qubits = adaptor.getQubits();
+    SmallVector<Value> args;
+
+    if (!paramsAreSpan) {
+      args.append(params.begin(), params.end());
+    } else {
+      // Convert the device-side span back to a host-side vector so that C++
+      // doesn't crash. The stand-in is a stack array of three pointers.
+      Value span = params[0];
+      auto *ctx = rewriter.getContext();
+      Type eleTy = cast<cudaq::cc::StdvecType>(span.getType()).getElementType();
+      auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+      auto eleArrPtrTy =
+          cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
+      auto fakeVecTy = cudaq::cc::ArrayType::get(ctx, elePtrTy, 3);
+      auto fakeVec = rewriter.create<cudaq::cc::AllocaOp>(loc, fakeVecTy);
+
+      // Compute the begin pointer and the one-past-the-end pointer.
+      Value beginArr =
+          rewriter.create<cudaq::cc::StdvecDataOp>(loc, eleArrPtrTy, span);
+      Value length = rewriter.create<cudaq::cc::StdvecSizeOp>(
+          loc, rewriter.getI64Type(), span);
+      Value endPtr = rewriter.create<cudaq::cc::ComputePtrOp>(
+          loc, elePtrTy, beginArr,
+          ArrayRef<cudaq::cc::ComputePtrArg>{length});
+      Value beginPtr =
+          rewriter.create<cudaq::cc::CastOp>(loc, elePtrTy, beginArr);
+
+      // Slot 0 holds the begin pointer; slots 1 and 2 both hold the end
+      // pointer.
+      auto slotPtrTy = cudaq::cc::PointerType::get(elePtrTy);
+      Value slot0 = rewriter.create<cudaq::cc::ComputePtrOp>(
+          loc, slotPtrTy, fakeVec, ArrayRef<cudaq::cc::ComputePtrArg>{0});
+      rewriter.create<cudaq::cc::StoreOp>(loc, beginPtr, slot0);
+      Value slot1 = rewriter.create<cudaq::cc::ComputePtrOp>(
+          loc, slotPtrTy, fakeVec, ArrayRef<cudaq::cc::ComputePtrArg>{1});
+      rewriter.create<cudaq::cc::StoreOp>(loc, endPtr, slot1);
+      Value slot2 = rewriter.create<cudaq::cc::ComputePtrOp>(
+          loc, slotPtrTy, fakeVec, ArrayRef<cudaq::cc::ComputePtrArg>{2});
+      rewriter.create<cudaq::cc::StoreOp>(loc, endPtr, slot2);
+
+      // N.B. This pointer must be treated as const by the C++ side and should
+      // never have move semantics!
+      args.push_back(fakeVec);
+
+      // Finally, rewrite the first input of the called function's signature
+      // to the type of the stand-in vector.
+      auto module = noise->getParentOfType<ModuleOp>();
+      auto emptyFuncTy = FunctionType::get(ctx, {}, {});
+      auto [calleeFn, unusedFlag] = cudaq::opt::factory::getOrAddFunc(
+          loc, *noise.getNoiseFunc(), emptyFuncTy, module);
+      auto oldFuncTy = calleeFn.getFunctionType();
+      SmallVector<Type> inputTys(oldFuncTy.getInputs().begin(),
+                                 oldFuncTy.getInputs().end());
+      inputTys[0] = fakeVec.getType();
+      calleeFn.setFunctionType(
+          FunctionType::get(ctx, inputTys, oldFuncTy.getResults()));
+    }
+
+    args.append(qubits.begin(), qubits.end());
+    rewriter.replaceOpWithNewOp<func::CallOp>(noise, TypeRange{},
+                                              *noise.getNoiseFunc(), args);
+    return success();
+  }
+};
+
 struct MaterializeConstantArrayOpRewrite
     : public OpConversionPattern<cudaq::codegen::MaterializeConstantArrayOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -919,10 +1068,10 @@ struct MeasurementOpPattern : public OpConversionPattern<quake::MzOp> {
       auto cstringGlobal =
           createGlobalCString(mz, loc, rewriter, regNameAttr.getValue());
       if constexpr (!M::discriminateToClassical) {
-        // These QIR profile variants force all record output calls to appear at
-        // the end. In these variants, control-flow isn't allowed in the final
-        // LLVM. Therefore, a single basic block is assumed but unchecked here
-        // as the verifier will raise an error.
+        // These QIR profile variants force all record output calls to appear
+        // at the end. In these variants, control-flow isn't allowed in the
+        // final LLVM. Therefore, a single basic block is assumed but unchecked
+        // here as the verifier will raise an error.
         rewriter.setInsertionPoint(rewriter.getBlock()->getTerminator());
       }
       auto recOut = rewriter.create<func::CallOp>(
@@ -1454,8 +1603,8 @@ static void commonClassicalHandlingPatterns(RewritePatternSet &patterns,
 static void commonQuakeHandlingPatterns(RewritePatternSet &patterns,
                                         TypeConverter &typeConverter,
                                         MLIRContext *ctx) {
-  patterns.insert<GetMemberOpRewrite, MakeStruqOpRewrite, RelaxSizeOpErase,
-                  VeqSizeOpRewrite>(typeConverter, ctx);
+  patterns.insert<ApplyNoiseOpRewrite, GetMemberOpRewrite, MakeStruqOpRewrite,
+                  RelaxSizeOpErase, VeqSizeOpRewrite>(typeConverter, ctx);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1865,10 +2014,10 @@ struct QuakeToQIRAPIPrepPass
 
         // Recursive walk in func.
         func.walk([&](Operation *op) {
-          // Annotate all qubit allocations with the starting qubit index value.
-          // This ought to handle both reference and value semantics. If the
-          // value semantics is using wire sets, no (redundant) annotation is
-          // needed.
+          // Annotate all qubit allocations with the starting qubit index
+          // value. This ought to handle both reference and value semantics. If
+          // the value semantics is using wire sets, no (redundant) annotation
+          // is needed.
           if (auto alloc = dyn_cast<quake::AllocaOp>(op)) {
             auto allocTy = alloc.getType();
             if (isa<quake::RefType>(allocTy)) {