cuda precomputation impl

grandinetech · owanikin · Mar 20, 2024 · Apr 7, 2024 · Apr 7, 2024 · Apr 7, 2024
commit 89b44390f894d2aa676341e39ffa8e511acb8019
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/arkworks/Cargo.toml b/arkworks/Cargo.toml
@@ -16,9 +16,6 @@ hex = "0.4.3"
 rand = { version = "0.8.5", optional = true }
 libc = { version = "0.2.148", default-features = false }
 rayon = { version = "1.9.0", optional = true }
-icicle-bls12-381 = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0", features = ["arkworks"] }
-icicle-core = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0" }
-icicle-cuda-runtime = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0" }
 
 [dev-dependencies]
 criterion = "0.5.1"
@@ -50,6 +47,9 @@ bgmw = [
 arkmsm = [
     "kzg/arkmsm"
 ]
+cuda = [
+    "kzg/cuda"
+]
 
 [[bench]]
 name = "fft"

diff --git a/arkworks/src/fft_g1.rs b/arkworks/src/fft_g1.rs
@@ -4,12 +4,10 @@ use crate::kzg_types::{ArkFp, ArkFr, ArkG1, ArkG1Affine};
 
 use crate::kzg_types::ArkG1ProjAddAffine;
 
-use ark_std::iterable::Iterable;
-use icicle_core::traits::ArkConvertible;
-use kzg::msm::msm_impls::{batch_convert, msm};
+use kzg::msm::msm_impls::msm;
 
 use kzg::msm::precompute::PrecomputationTable;
-use kzg::{Fr as KzgFr, G1Affine, G1Mul};
+use kzg::{Fr as KzgFr, G1Mul};
 use kzg::{FFTG1, G1};
 use std::ops::MulAssign;
 
@@ -20,16 +18,12 @@ pub fn g1_linear_combination(
     len: usize,
     precomputation: Option<&PrecomputationTable<ArkFr, ArkG1, ArkFp, ArkG1Affine>>,
 ) {
-    let affines = icicle_cuda_runtime::memory::HostOrDeviceSlice::on_host(batch_convert(points).iter().map(|v: &ArkG1Affine| {
-        icicle_bls12_381::curve::G1Affine::from_ark(v.aff)
-    }).collect::<Vec<_>>());
-
-    let scalars = icicle_cuda_runtime::memory::HostOrDeviceSlice::on_host(scalars.iter().map(|v| icicle_bls12_381::curve::ScalarField::from_ark(v.fr)).collect::<Vec<_>>());
-    let mut results = icicle_cuda_runtime::memory::HostOrDeviceSlice::on_host(vec![icicle_bls12_381::curve::G1Projective::zero()]);
-
-    icicle_core::msm::msm::<icicle_bls12_381::curve::CurveCfg>(&scalars, &affines, &icicle_core::msm::MSMConfig::default_for_device(1), &mut results).unwrap();
-
-    *out = ArkG1(results.as_slice()[0].to_ark());
+    *out = msm::<ArkG1, ArkFp, ArkG1Affine, ArkG1ProjAddAffine, ArkFr>(
+        points,
+        scalars,
+        len,
+        precomputation,
+    );
 }
 
 pub fn make_data(data: usize) -> Vec<ArkG1> {

diff --git a/arkworks/src/kzg_types.rs b/arkworks/src/kzg_types.rs
@@ -17,6 +17,7 @@ use crate::utils::{
 use ark_bls12_381::{g1, g2, Fr, G1Affine, G2Affine};
 use ark_ec::{models::short_weierstrass::Projective, AffineRepr, Group};
 use ark_ec::{CurveConfig, CurveGroup};
+use ark_ff::BigInt;
 use ark_ff::{biginteger::BigInteger256, BigInteger, Field};
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 use ark_std::{One, Zero};
@@ -840,6 +841,16 @@ impl G1Fp for ArkFp {
         Self(default)
     }
 
+    fn to_limbs(&self) -> [u64; 6] {
+        self.0.0.0
+    }
+
+    fn from_bytes_le(bytes: &[u8; 48]) -> Self {
+        let storage: [u64; 6] = bytes.chunks(8).map(|it| u64::from_le_bytes(it.try_into().unwrap())).collect::<Vec<_>>().try_into().unwrap();
+        let big_int = BigInt::new(storage);
+        Self(ArkFpInt::from(big_int))
+    }
+
     fn neg_assign(&mut self) {
         self.0 = -self.0;
     }

diff --git a/kzg/Cargo.toml b/kzg/Cargo.toml
@@ -10,6 +10,9 @@ num_cpus = { version = "1.16.0", optional = true }
 rayon = { version = "1.8.0", optional = true } 
 threadpool = { version = "^1.8.1", optional = true }
 siphasher = { version = "1.0.0", default-features = false }
+icicle-bls12-381 = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0", optional = true }
+icicle-core = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0", optional = true }
+icicle-cuda-runtime = { git = "https://github.com/ArtiomTr/icicle.git", rev = "17327c36ac78f0f9135aa30e4ab037d2c98646e9", version = "1.8.0", optional = true }
 
 [features]
 default = [
@@ -29,3 +32,9 @@ std = [
 rand = []
 arkmsm = []
 bgmw = []
+cuda = [
+    "parallel",
+    "dep:icicle-bls12-381",
+    "dep:icicle-core",
+    "dep:icicle-cuda-runtime"
+]
diff --git a/kzg/src/lib.rs b/kzg/src/lib.rs
@@ -201,6 +201,10 @@ pub trait G1Fp: Clone + Default + Sync + Copy + PartialEq + Debug + Send {
     fn set_one(&mut self) {
         *self = Self::ONE;
     }
+
+    fn to_limbs(&self) -> [u64; 6];
+
+    fn from_bytes_le(bytes: &[u8; 48]) -> Self;
 }
 
 pub trait G1Affine<TG1: G1, TG1Fp: G1Fp>:

diff --git a/kzg/src/msm/cuda.rs b/kzg/src/msm/cuda.rs
@@ -0,0 +1,98 @@
+use core::marker::PhantomData;
+
+use icicle_bls12_381::curve::CurveCfg;
+use icicle_core::{curve::Affine, msm::MSMConfig, traits::FieldImpl};
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+use core::fmt::Debug;
+use crate::{Fr, G1Affine, G1Fp, G1GetFp, G1Mul, Scalar256, G1};
+
+use super::msm_impls::batch_convert;
+
+pub struct IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>
+where
+    TFr: Fr,
+    TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+    TG1Fp: G1Fp,
+    TG1Affine: G1Affine<TG1, TG1Fp>,
+{
+    affines: HostOrDeviceSlice<'static, Affine<CurveCfg>>,
+
+    g1_marker: PhantomData<TG1>,
+    g1_fp_marker: PhantomData<TG1Fp>,
+    fr_marker: PhantomData<TFr>,
+    g1_affine_marker: PhantomData<TG1Affine>
+}
+
+impl<
+TFr: Fr,
+TG1Fp: G1Fp,
+TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+TG1Affine: G1Affine<TG1, TG1Fp>,
+> Debug for IcicleConfig<TFr, TG1, TG1Fp, TG1Affine> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // TODO: add formatting for affines
+        f.debug_struct("IcicleConfig").finish()
+    }
+}
+
+impl<
+TFr: Fr,
+TG1Fp: G1Fp,
+TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+TG1Affine: G1Affine<TG1, TG1Fp>,
+> Clone for IcicleConfig<TFr, TG1, TG1Fp, TG1Affine> {
+    fn clone(&self) -> Self {
+        // FIXME: affines should be cloned actually
+        Self { affines: HostOrDeviceSlice::Host(vec![]), g1_marker: PhantomData, g1_fp_marker: PhantomData, fr_marker: PhantomData, g1_affine_marker: PhantomData }
+    }
+}
+
+impl<
+        TFr: Fr,
+        TG1Fp: G1Fp,
+        TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+        TG1Affine: G1Affine<TG1, TG1Fp>,
+    > IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>
+{
+    pub fn new(points: &[TG1]) -> Result<Option<Self>, String> {
+        let affines_raw = batch_convert::<TG1, TG1Fp, TG1Affine>(points).iter().map(|it| icicle_bls12_381::curve::G1Affine::from_limbs(it.x().to_limbs(), it.y().to_limbs())).collect::<Vec<_>>();
+        let Ok(mut affines) = HostOrDeviceSlice::<'static, Affine<CurveCfg>>::cuda_malloc(affines_raw.len()) else {
+            return Ok(None);
+        };
+        if affines.copy_from_host(&affines_raw).is_err() {
+            return Ok(None);
+        }
+
+        Ok(Some(Self {
+            affines,
+
+            fr_marker: PhantomData,
+            g1_fp_marker: PhantomData,
+            g1_marker: PhantomData,
+            g1_affine_marker: PhantomData
+        }))
+    }
+
+    pub fn multiply_sequential(&self, _scalars: &[Scalar256]) -> TG1 {
+        panic!("No sequential implementation for CUDA MSM");
+    }
+
+    #[cfg(feature = "parallel")]
+    pub fn multiply_parallel(&self, scalars: &[Scalar256]) -> TG1 {
+        use icicle_bls12_381::curve::ScalarField;
+
+        let scalars = HostOrDeviceSlice::on_host(scalars.iter().map(|it| ScalarField::from_bytes_le(it.as_u8())).collect::<Vec<_>>());
+
+        let mut results = HostOrDeviceSlice::on_host(vec![icicle_bls12_381::curve::G1Projective::zero()]);
+
+        icicle_core::msm::msm(&scalars, &self.affines, &MSMConfig::default_for_device(0), &mut results).unwrap();
+
+        let mut output = TG1::default();
+
+        *output.x_mut() = TG1Fp::from_bytes_le(&results.as_slice()[0].x.to_bytes_le().try_into().unwrap());
+        *output.y_mut() = TG1Fp::from_bytes_le(&results.as_slice()[0].y.to_bytes_le().try_into().unwrap());
+        *output.z_mut() = TG1Fp::from_bytes_le(&results.as_slice()[0].z.to_bytes_le().try_into().unwrap());
+
+        output
+    }
+}
diff --git a/kzg/src/msm/mod.rs b/kzg/src/msm/mod.rs
@@ -15,3 +15,11 @@ mod pippenger_utils;
 
 #[cfg(all(feature = "bgmw", any(not(feature = "arkmsm"), feature = "parallel")))]
 mod bgmw;
+
+#[cfg(feature = "cuda")]
+mod cuda;
+
+#[cfg(all(feature = "cuda", feature = "bgmw"))]
+compile_error!{"features `cuda` and `bgmw` are mutally exclusive"}
+#[cfg(all(feature = "cuda", not(feature = "parallel")))]
+compile_error!{"feature `cuda` requires feature `parallel`"}
diff --git a/kzg/src/msm/precompute.rs b/kzg/src/msm/precompute.rs
@@ -9,7 +9,7 @@ pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> =
     super::bgmw::BgmwTable<TFr, TG1, TG1Fp, TG1Affine>;
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 #[derive(Debug, Clone)]
@@ -27,7 +27,7 @@ where
 }
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 impl<TFr, TG1, TG1Fp, TG1Affine> EmptyTable<TFr, TG1, TG1Fp, TG1Affine>
@@ -52,11 +52,14 @@ where
 }
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> = EmptyTable<TFr, TG1, TG1Fp, TG1Affine>;
 
+#[cfg(feature = "cuda")]
+pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> = super::cuda::IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>;
+
 pub fn precompute<TFr, TG1, TG1Fp, TG1Affine>(
     points: &[TG1],
 ) -> Result<Option<PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine>>, String>