pyscf
diff --git a/‎README.md
Lines changed: 14 additions & 17 deletions b/‎README.md
Lines changed: 14 additions & 17 deletions
diff --git a/‎examples/dft_driver.py
Lines changed: 1 addition & 1 deletion b/‎examples/dft_driver.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎gpu4pyscf/__config__.py
Lines changed: 1 addition & 1 deletion b/‎gpu4pyscf/__config__.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎gpu4pyscf/df/tests/test_df_ecp.py
Lines changed: 0 additions & 3 deletions b/‎gpu4pyscf/df/tests/test_df_ecp.py
Lines changed: 0 additions & 3 deletions
diff --git a/‎gpu4pyscf/df/tests/test_df_grad.py
Lines changed: 5 additions & 5 deletions b/‎gpu4pyscf/df/tests/test_df_grad.py
Lines changed: 5 additions & 5 deletions
diff --git a/‎gpu4pyscf/df/tests/test_df_hessian.py
Lines changed: 4 additions & 4 deletions b/‎gpu4pyscf/df/tests/test_df_hessian.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎gpu4pyscf/df/tests/test_int3c2e.py
Lines changed: 11 additions & 11 deletions b/‎gpu4pyscf/df/tests/test_int3c2e.py
Lines changed: 11 additions & 11 deletions
diff --git a/‎gpu4pyscf/df/tests/test_jk.py
Lines changed: 5 additions & 4 deletions b/‎gpu4pyscf/df/tests/test_jk.py
Lines changed: 5 additions & 4 deletions
diff --git a/‎gpu4pyscf/dft/__init__.py
Lines changed: 1 addition & 0 deletions b/‎gpu4pyscf/dft/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎gpu4pyscf/dft/gen_grid.py
Lines changed: 20 additions & 12 deletions b/‎gpu4pyscf/dft/gen_grid.py
Lines changed: 20 additions & 12 deletions
diff --git a/‎gpu4pyscf/dft/numint.py
Lines changed: 20 additions & 4 deletions b/‎gpu4pyscf/dft/numint.py
Lines changed: 20 additions & 4 deletions
diff --git a/‎gpu4pyscf/dft/rks.py
Lines changed: 0 additions & 1 deletion b/‎gpu4pyscf/dft/rks.py
Lines changed: 0 additions & 1 deletion
@@ -4,25 +4,22 @@ Installation
 --------
 
 > [!NOTE]
-> The compiled binary packages support compute capability 7.0 and later (Volta and later, such as Tesla V100, RTX 20 series and later). For older GPUs, please compile the package with the source code as follows.
+> The compiled binary packages support compute capability 7.0 and later (Volta and later, such as Tesla V100, RTX 20 series and later). For older GPUs (e.g. GTX 10 series, Tesla P100), please compile the package from source as follows.
 
-For **CUDA 11.x**
-```sh
-pip3 install gpu4pyscf-cuda11x
-```
-and install cutensor
-```sh
-python -m cupyx.tools.install_library --cuda 11.x --library cutensor
-```
+Run ```nvidia-smi``` in your terminal to check the CUDA version supported by your driver.
 
-For **CUDA 12.x**
-```sh
-pip3 install gpu4pyscf-cuda12x
-```
-and install cutensor
-```sh
-python -m cupyx.tools.install_library --cuda 12.x --library cutensor
-```
+Choose the proper package based on your CUDA environment.
+
+| Platform      | Command                               |
+|---------------|---------------------------------------|
+| **CUDA 11.x** |  ```pip3 install gpu4pyscf-cuda11x``` |
+| **CUDA 12.x** |  ```pip3 install gpu4pyscf-cuda12x``` |
+
+Installing ```cuTensor``` is **highly recommended** because it accelerates tensor contractions.
+
+For **CUDA 11.x**, ```python -m cupyx.tools.install_library --cuda 11.x --library cutensor```
+
+For **CUDA 12.x**, ```python -m cupyx.tools.install_library --cuda 12.x --library cutensor```
 
 Compilation
 --------
 
@@ -51,7 +51,7 @@
     mf_df.nlcgrids.atom_grid = (50,194)
 mf_df.direct_scf_tol = 1e-14
 mf_df.direct_scf = 1e-14
-mf_df.conv_tol = 1e-12
+mf_df.conv_tol = 1e-10
 e_tot = mf_df.kernel()
 scf_time = time.time() - start_time
 print(f'compute time for energy: {scf_time:.3f} s')
 
@@ -4,7 +4,7 @@
 GB = 1024*1024*1024
 # such as A100-80G
 if props['totalGlobalMem'] >= 64 * GB:
-    min_ao_blksize = 256
+    min_ao_blksize = 128
     min_grid_blksize = 128*128
     ao_aligned = 32
     grid_aligned = 128
 
@@ -37,9 +37,6 @@ def setUpModule():
         output = '/dev/null'
     )
 
-    mol.build()
-    mol.verbose = 3
-
 def tearDownModule():
     global mol
     mol.stdout.close()
 
@@ -35,18 +35,18 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 
-xc0='B3LYP'
-bas0='def2-tzvpp'
-auxbasis0='def2-tzvpp-jkfit'
-disp0='d3bj'
+xc0 = 'B3LYP'
+bas0 = 'def2-tzvpp'
+auxbasis0 = 'def2-tzvpp-jkfit'
+disp0 = 'd3bj'
 grids_level = 6
 nlcgrids_level = 3
 def setUpModule():
     global mol
     mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
     mol.output = '/dev/null'
-    mol.build()
     mol.verbose = 1
+    mol.build()
 
 eps = 1.0/1024
 
 
@@ -24,10 +24,10 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 
-xc0='B3LYP'
-bas0='def2-tzvpp'
-auxbasis0='def2-tzvpp-jkfit'
-disp0='d3bj'
+xc0 = 'B3LYP'
+bas0 = 'def2-tzvpp'
+auxbasis0 = 'def2-tzvpp-jkfit'
+disp0 = 'd3bj'
 grids_level = 6
 eps = 1e-3
 
 
@@ -26,7 +26,7 @@
 libgint = load_library('libgint')
 
 '''
-compare int3c2e by pyscf and gpu4pyscf
+check int3c2e consistency between pyscf and gpu4pyscf
 '''
 
 def setUpModule():
@@ -41,13 +41,13 @@ def setUpModule():
                   output='/dev/null')
     auxmol = df.addons.make_auxmol(mol, auxbasis='def2-tzvpp-jkfit')
     auxmol.output = '/dev/null'
-    
+
 def tearDownModule():
     global mol, auxmol
     mol.stdout.close()
     auxmol.stdout.close()
     del mol, auxmol
-    
+
 omega = 0.2
 
 def check_int3c2e_derivatives(ip_type):
@@ -69,40 +69,40 @@ def check_int3c2e_derivatives(ip_type):
         int3c_pyscf = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, aosym='s1', cintopt=opt)
         int3c_gpu = int3c2e.get_int3c2e_general(mol, auxmol, ip_type=ip_type, omega=omega).get()
         assert np.linalg.norm(int3c_pyscf - int3c_gpu) < 1e-9
-    
+
 class KnownValues(unittest.TestCase):
     def test_int3c2e(self):
         get_int3c = _int3c_wrapper(mol, auxmol, 'int3c2e', 's1')
         int3c_pyscf = get_int3c((0, mol.nbas, 0, mol.nbas, 0, auxmol.nbas))
         int3c_gpu = int3c2e.get_int3c2e(mol, auxmol, aosym='s1').get()
         assert np.linalg.norm(int3c_gpu - int3c_pyscf) < 1e-9
-    
+
     def test_int3c2e_omega(self):
         omega = 0.2
         with mol.with_range_coulomb(omega):
             get_int3c = _int3c_wrapper(mol, auxmol, 'int3c2e', 's1')
             int3c_pyscf = get_int3c((0, mol.nbas, 0, mol.nbas, 0, auxmol.nbas))
             int3c_gpu = int3c2e.get_int3c2e(mol, auxmol, aosym='s1', omega=omega).get()
         assert np.linalg.norm(int3c_gpu[0,0,:] - int3c_pyscf[0,0,:]) < 1e-9
-    
+
     def test_int3c2e_ip1(self):
         check_int3c2e_derivatives('ip1')
-    
+
     def test_int3c2e_ip2(self):
         check_int3c2e_derivatives('ip2')
-    
+
     def test_int3c2e_ipip1(self):
         check_int3c2e_derivatives('ipip1')
 
     def test_int3c2e_ipip2(self):
         check_int3c2e_derivatives('ipip2')
-    
+
     def test_int3c2e_ip1ip2(self):
         check_int3c2e_derivatives('ip1ip2')
 
     def test_int3c2e_ipvip1(self):
         check_int3c2e_derivatives('ipvip1')
-    
+
     def test_int1e_iprinv(self):
         from pyscf import gto
         coords = mol.atom_coords()
@@ -115,7 +115,7 @@ def test_int1e_iprinv(self):
             mol.set_rinv_origin(coords[i])
             h1ao = mol.intor('int1e_iprinv', comp=3) # <\nabla|1/r|>
             assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
-            
+
 if __name__ == "__main__":
     print("Full Tests for int3c")
     unittest.main()
@@ -41,13 +41,14 @@ def setUpModule():
     mol.build()
     mol.verbose = 1
     auxmol = df.addons.make_auxmol(mol, auxbasis='sto3g')
-    
+
 def tearDownModule():
     global mol, auxmol
+    mol.stdout.close()
     del mol, auxmol
 
 class KnownValues(unittest.TestCase):
-    
+
     def test_vj_incore(self):
         int3c_gpu = int3c2e.get_int3c2e(mol, auxmol, aosym=True, direct_scf_tol=1e-14)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
@@ -66,7 +67,7 @@ def test_vj_incore(self):
         vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore)
         vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore)
         assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-9
-    
+
     def test_j_outcore(self):
         cupy.random.seed(1)
         nao = mol.nao
@@ -77,7 +78,7 @@ def test_j_outcore(self):
         vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False)
         vj = df_jk.get_j(mf.with_df, dm)
         assert cupy.linalg.norm(vj - vj0) < 1e-9
-    
+
 
 if __name__ == "__main__":
     print("Full Tests for DF JK")
 
@@ -3,3 +3,4 @@
 from .uks import UKS
 from .gks import GKS
 from .roks import ROKS
+from gpu4pyscf.dft.gen_grid import Grids
@@ -185,13 +185,22 @@ def gen_grids_partition(atm_coords, coords, a):
     stream = cupy.cuda.get_current_stream()
     natm = atm_coords.shape[0]
     ngrids = coords.shape[0]
-    pbecke = cupy.ones([natm, ngrids], order='C')
     assert ngrids < 65535 * 16
+    x_i = cupy.expand_dims(atm_coords, axis=1)
+    x_g = cupy.expand_dims(coords, axis=0)
+    squared_diff = (x_i - x_g)**2
+    dist_ig = cupy.sum(squared_diff, axis=2)**0.5
+
+    x_j = cupy.expand_dims(atm_coords, axis=0)
+    squared_diff = (x_i - x_j)**2
+    dist_ij = cupy.sum(squared_diff, axis=2)**0.5
+
+    pbecke = cupy.ones([natm, ngrids], order='C')
     err = libgdft.GDFTgen_grid_partition(
         ctypes.cast(stream.ptr, ctypes.c_void_p),
         ctypes.cast(pbecke.data.ptr, ctypes.c_void_p),
-        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
-        ctypes.cast(atm_coords.data.ptr, ctypes.c_void_p),
+        ctypes.cast(dist_ig.data.ptr, ctypes.c_void_p),
+        ctypes.cast(dist_ij.data.ptr, ctypes.c_void_p),
         ctypes.cast(a.data.ptr, ctypes.c_void_p),
         ctypes.c_int(ngrids),
         ctypes.c_int(natm)
@@ -243,13 +252,6 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev,
             logger.debug(mol, 'atom %s rad-grids = %d, ang-grids = %s',
                          symb, n_rad, angs)
 
-            ang_grids = {}
-            for n in sorted(set(angs)):
-                grid = numpy.empty((n,4))
-                libdft.MakeAngularGrid(grid.ctypes.data_as(ctypes.c_void_p),
-                                       ctypes.c_int(n))
-                ang_grids[n] = grid
-
             angs = numpy.array(angs)
             coords = []
             vol = []
@@ -258,8 +260,13 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev,
                 libdft.MakeAngularGrid(grid.ctypes.data_as(ctypes.c_void_p),
                                        ctypes.c_int(n))
                 idx = numpy.where(angs==n)[0]
-                coords.append(cupy.einsum('i,jk->jik', rad[idx], grid[:,:3]).reshape(-1,3))
-                vol.append(cupy.einsum('i,j->ji', rad_weight[idx], grid[:,3]).ravel())
+                for i0, i1 in lib.prange(0, len(idx), 12):  # 12 radi-grids as a group
+                    coords.append(numpy.einsum('i,jk->jik',rad[idx[i0:i1]],
+                                               grid[:,:3]).reshape(-1,3))
+                    vol.append(numpy.einsum('i,j->ji', rad_weight[idx[i0:i1]],
+                                            grid[:,3]).ravel())
+                #coords.append(cupy.einsum('i,jk->jik', rad[idx], grid[:,:3]).reshape(-1,3))
+                #vol.append(cupy.einsum('i,j->ji', rad_weight[idx], grid[:,3]).ravel())
 
             atom_grids_tab[symb] = (cupy.vstack(coords), cupy.hstack(vol))
 
@@ -327,6 +334,7 @@ def gen_grid_partition(coords):
 
     coords_all = []
     weights_all = []
+    assert radii_adjust == radi.treutler_atomic_radii_adjust
     a = -radi.get_treutler_fac(mol, atomic_radii)
     for ia in range(mol.natm):
         coords, vol = atom_grids_tab[mol.atom_symbol(ia)]
 
@@ -34,8 +34,8 @@
 GRID_BLKSIZE = 32
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
+AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
 AO_THRESHOLD = 1e-12
-AO_ALIGNMENT = 32
 
 # Should we release the cupy cache?
 FREE_CUPY_CACHE = False
@@ -564,6 +564,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         nelec = nelec[0]
         excsum = excsum[0]
         vmat = vmat[0]
+
     return nelec, excsum, vmat#np.asarray(vmat)
 
 def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
@@ -984,16 +985,31 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     if opt is None:
         ni.build(mol, grids.coords)
         opt = ni.gdftopt
+
+    mo_coeff = getattr(dms, 'mo_coeff', None)
+    mo_occ = getattr(dms,'mo_occ', None)
+
     nao, nao0 = opt.coeff.shape
     mol = opt.mol
     coeff = cupy.asarray(opt.coeff)
     dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+    assert len(dms) == 1
+
+    if mo_coeff is not None:
+        mo_coeff = coeff @ mo_coeff
+
     ao_deriv = 1
     vvrho = []
-    for ao, mask, weight, coords \
+    for ao, idx, weight, coords \
             in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-        rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1)
+        #rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1)
+        if mo_coeff is None:
+            rho = eval_rho(mol, ao, dms[0][np.ix_(idx,idx)], xctype='GGA', hermi=1)
+        else:
+            mo_coeff_mask = mo_coeff[idx,:]
+            rho = eval_rho2(mol, ao, mo_coeff_mask, mo_occ, None, 'GGA')
         vvrho.append(rho)
+
     rho = cupy.hstack(vvrho)
     t1 = log.timer_debug1('eval rho', *t0)
     exc = 0
@@ -1227,7 +1243,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
             # cache ao indices
             if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx:
                 stream = cupy.cuda.get_current_stream()
-                cutoff = 1e-12
+                cutoff = AO_THRESHOLD
                 ng = ip1 - ip0
                 ao_loc = mol.ao_loc_nr()
                 nbas = mol.nbas
 
@@ -93,7 +93,6 @@ def initialize_grids(ks, mol=None, dm=None):
                     # Filter grids the first time setup grids
                     ks.nlcgrids = prune_small_rho_grids_(ks, ks.mol, dm, ks.nlcgrids)
                 t0 = logger.timer_debug1(ks, 'setting up nlc grids', *t0)
-
     return ks
 
 def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
Original file line number	Diff line number	Diff line change
`@@ -37,9 +37,6 @@ def setUpModule():`
`37`	`37`	`output = '/dev/null'`
`38`	`38`	`)`
`39`	`39`
`40`		`- mol.build()`
`41`		`- mol.verbose = 3`
`42`		`-`
`43`	`40`	`def tearDownModule():`
`44`	`41`	`global mol`
`45`	`42`	`mol.stdout.close()`