15
15
#include " module_hamilt_general/module_ewald/H_Ewald_pw.h"
16
16
#include " module_hamilt_general/module_surchem/surchem.h"
17
17
#include " module_hamilt_general/module_vdw/vdw.h"
18
+ #include " kernels/force_op.h"
18
19
19
20
#ifdef _OPENMP
20
21
#include < omp.h>
@@ -531,31 +532,110 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
531
532
// to G space. maybe need fftw with OpenMP
532
533
rho_basis->real2recip (aux, aux);
533
534
534
- #ifdef _OPENMP
535
- #pragma omp parallel for
536
- #endif
537
- for (int iat = 0 ; iat < this ->nat ; ++iat)
538
- {
539
- // read `it` `ia` from the table
535
+ // sincos op for G space
536
+
537
+
538
+ // data preparation
539
+ std::vector<FPTYPE> tau_flat (this ->nat * 3 );
540
+ std::vector<FPTYPE> gcar_flat (rho_basis->npw * 3 );
541
+
542
+
543
+ for (int iat = 0 ; iat < this ->nat ; iat++) {
544
+ int it = ucell.iat2it [iat];
545
+ int ia = ucell.iat2ia [iat];
546
+
547
+ tau_flat[iat * 3 + 0 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][0 ]);
548
+ tau_flat[iat * 3 + 1 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][1 ]);
549
+ tau_flat[iat * 3 + 2 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][2 ]);
550
+ }
551
+
552
+ for (int ig = 0 ; ig < rho_basis->npw ; ig++) {
553
+ gcar_flat[ig * 3 + 0 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][0 ]);
554
+ gcar_flat[ig * 3 + 1 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][1 ]);
555
+ gcar_flat[ig * 3 + 2 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][2 ]);
556
+ }
557
+
558
+ // calculate vloc_factors for all atom types
559
+ std::vector<FPTYPE> vloc_per_type_host (this ->nat * rho_basis->npw );
560
+ for (int iat = 0 ; iat < this ->nat ; iat++) {
540
561
int it = ucell.iat2it [iat];
541
- int ia = ucell.iat2ia [iat];
542
- for (int ig = 0 ; ig < rho_basis->npw ; ig++)
543
- {
544
- const double phase = ModuleBase::TWO_PI * (rho_basis->gcar [ig] * ucell.atoms [it].tau [ia]);
545
- double sinp, cosp;
546
- ModuleBase::libm::sincos (phase, &sinp, &cosp);
547
- const double factor
548
- = vloc (it, rho_basis->ig2igg [ig]) * (cosp * aux[ig].imag () + sinp * aux[ig].real ());
549
- forcelc (iat, 0 ) += rho_basis->gcar [ig][0 ] * factor;
550
- forcelc (iat, 1 ) += rho_basis->gcar [ig][1 ] * factor;
551
- forcelc (iat, 2 ) += rho_basis->gcar [ig][2 ] * factor;
562
+ for (int ig = 0 ; ig < rho_basis->npw ; ig++) {
563
+ vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast <FPTYPE>(vloc (it, rho_basis->ig2igg [ig]));
552
564
}
553
- forcelc (iat, 0 ) *= (ucell.tpiba * ucell.omega );
554
- forcelc (iat, 1 ) *= (ucell.tpiba * ucell.omega );
555
- forcelc (iat, 2 ) *= (ucell.tpiba * ucell.omega );
565
+ }
566
+
567
+ std::vector<std::complex<FPTYPE>> aux_fptype (rho_basis->npw );
568
+ for (int ig = 0 ; ig < rho_basis->npw ; ig++) {
569
+ aux_fptype[ig] = static_cast <std::complex<FPTYPE>>(aux[ig]);
570
+ }
571
+
572
+ FPTYPE* d_gcar = gcar_flat.data ();
573
+ FPTYPE* d_tau = tau_flat.data ();
574
+ FPTYPE* d_vloc_per_type = vloc_per_type_host.data ();
575
+ std::complex<FPTYPE>* d_aux = aux_fptype.data ();
576
+ FPTYPE* d_force = nullptr ;
577
+ std::vector<FPTYPE> force_host (this ->nat * 3 );
578
+
579
+ if (this ->device == base_device::GpuDevice)
580
+ {
581
+ d_gcar = nullptr ;
582
+ d_tau = nullptr ;
583
+ d_vloc_per_type = nullptr ;
584
+ d_aux = nullptr ;
585
+
586
+ resmem_var_op ()(this ->ctx , d_gcar, rho_basis->npw * 3 );
587
+ resmem_var_op ()(this ->ctx , d_tau, this ->nat * 3 );
588
+ resmem_var_op ()(this ->ctx , d_vloc_per_type, this ->nat * rho_basis->npw );
589
+ resmem_complex_op ()(this ->ctx , d_aux, rho_basis->npw );
590
+ resmem_var_op ()(this ->ctx , d_force, this ->nat * 3 );
591
+
592
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_gcar, gcar_flat.data (), rho_basis->npw * 3 );
593
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_tau, tau_flat.data (), this ->nat * 3 );
594
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_vloc_per_type, vloc_per_type_host.data (), this ->nat * rho_basis->npw );
595
+ syncmem_complex_h2d_op ()(this ->ctx , this ->cpu_ctx , d_aux, aux_fptype.data (), rho_basis->npw );
596
+
597
+ base_device::memory::set_memory_op<FPTYPE, Device>()(this ->ctx , d_force, 0.0 , this ->nat * 3 );
598
+ }
599
+ else
600
+ {
601
+ d_force = force_host.data ();
602
+ std::fill (force_host.begin (), force_host.end (), static_cast <FPTYPE>(0.0 ));
603
+ }
604
+
605
+ const FPTYPE scale_factor = static_cast <FPTYPE>(ucell.tpiba * ucell.omega );
606
+
607
+ // call op for sincos calculation
608
+ hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
609
+ this ->ctx ,
610
+ this ->nat ,
611
+ rho_basis->npw ,
612
+ this ->nat ,
613
+ d_gcar,
614
+ d_tau,
615
+ d_vloc_per_type,
616
+ d_aux,
617
+ scale_factor,
618
+ d_force
619
+ );
620
+
621
+ if (this ->device == base_device::GpuDevice)
622
+ {
623
+ syncmem_var_d2h_op ()(this ->cpu_ctx , this ->ctx , force_host.data (), d_force, this ->nat * 3 );
624
+
625
+ delmem_var_op ()(this ->ctx , d_gcar);
626
+ delmem_var_op ()(this ->ctx , d_tau);
627
+ delmem_var_op ()(this ->ctx , d_vloc_per_type);
628
+ delmem_complex_op ()(this ->ctx , d_aux);
629
+ delmem_var_op ()(this ->ctx , d_force);
630
+ }
631
+
632
+ for (int iat = 0 ; iat < this ->nat ; iat++) {
633
+ forcelc (iat, 0 ) = static_cast <double >(force_host[iat * 3 + 0 ]);
634
+ forcelc (iat, 1 ) = static_cast <double >(force_host[iat * 3 + 1 ]);
635
+ forcelc (iat, 2 ) = static_cast <double >(force_host[iat * 3 + 2 ]);
556
636
}
557
637
558
- // this->print(GlobalV::ofs_running, "local forces", forcelc);
638
+ // this->print(GlobalV: :ofs_running, "local forces", forcelc);
559
639
Parallel_Reduce::reduce_pool (forcelc.c , forcelc.nr * forcelc.nc );
560
640
delete[] aux;
561
641
ModuleBase::timer::tick (" Forces" , " cal_force_loc" );
@@ -665,6 +745,119 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
665
745
aux[rho_basis->ig_gge0 ] = std::complex<double >(0.0 , 0.0 );
666
746
}
667
747
748
+ // sincos op for cal_force_ew
749
+
750
+ std::vector<FPTYPE> it_facts_host (this ->nat );
751
+ std::vector<FPTYPE> tau_flat (this ->nat * 3 );
752
+
753
+ // iterate over by lookup table
754
+ for (int iat = 0 ; iat < this ->nat ; iat++) {
755
+ int it = ucell.iat2it [iat];
756
+ int ia = ucell.iat2ia [iat];
757
+
758
+ double zv;
759
+ if (PARAM.inp .use_paw )
760
+ {
761
+ #ifdef USE_PAW
762
+ zv = GlobalC::paw_cell.get_val (it);
763
+ #endif
764
+ }
765
+ else
766
+ {
767
+ zv = ucell.atoms [it].ncpp .zv ;
768
+ }
769
+
770
+ it_facts_host[iat] = static_cast <FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba *
771
+ ModuleBase::TWO_PI / ucell.omega * fact);
772
+
773
+ tau_flat[iat * 3 + 0 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][0 ]);
774
+ tau_flat[iat * 3 + 1 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][1 ]);
775
+ tau_flat[iat * 3 + 2 ] = static_cast <FPTYPE>(ucell.atoms [it].tau [ia][2 ]);
776
+ }
777
+
778
+ std::vector<FPTYPE> gcar_flat (rho_basis->npw * 3 );
779
+ for (int ig = 0 ; ig < rho_basis->npw ; ig++) {
780
+ gcar_flat[ig * 3 + 0 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][0 ]);
781
+ gcar_flat[ig * 3 + 1 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][1 ]);
782
+ gcar_flat[ig * 3 + 2 ] = static_cast <FPTYPE>(rho_basis->gcar [ig][2 ]);
783
+ }
784
+
785
+ std::vector<std::complex<FPTYPE>> aux_fptype (rho_basis->npw );
786
+ for (int ig = 0 ; ig < rho_basis->npw ; ig++) {
787
+ aux_fptype[ig] = static_cast <std::complex<FPTYPE>>(aux[ig]);
788
+ }
789
+
790
+ FPTYPE* d_gcar = gcar_flat.data ();
791
+ FPTYPE* d_tau = tau_flat.data ();
792
+ FPTYPE* d_it_facts = it_facts_host.data ();
793
+ std::complex<FPTYPE>* d_aux = aux_fptype.data ();
794
+ FPTYPE* d_force_g = nullptr ;
795
+ std::vector<FPTYPE> force_g_host (this ->nat * 3 );
796
+
797
+ if (this ->device == base_device::GpuDevice)
798
+ {
799
+ d_gcar = nullptr ;
800
+ d_tau = nullptr ;
801
+ d_it_facts = nullptr ;
802
+ d_aux = nullptr ;
803
+
804
+ resmem_var_op ()(this ->ctx , d_gcar, rho_basis->npw * 3 );
805
+ resmem_var_op ()(this ->ctx , d_tau, this ->nat * 3 );
806
+ resmem_var_op ()(this ->ctx , d_it_facts, this ->nat );
807
+ resmem_complex_op ()(this ->ctx , d_aux, rho_basis->npw );
808
+ resmem_var_op ()(this ->ctx , d_force_g, this ->nat * 3 );
809
+
810
+
811
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_gcar, gcar_flat.data (), rho_basis->npw * 3 );
812
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_tau, tau_flat.data (), this ->nat * 3 );
813
+ syncmem_var_h2d_op ()(this ->ctx , this ->cpu_ctx , d_it_facts, it_facts_host.data (), this ->nat );
814
+ syncmem_complex_h2d_op ()(this ->ctx , this ->cpu_ctx , d_aux, aux_fptype.data (), rho_basis->npw );
815
+
816
+
817
+ base_device::memory::set_memory_op<FPTYPE, Device>()(this ->ctx , d_force_g, 0.0 , this ->nat * 3 );
818
+ }
819
+ else
820
+ {
821
+ d_force_g = force_g_host.data ();
822
+ std::fill (force_g_host.begin (), force_g_host.end (), static_cast <FPTYPE>(0.0 ));
823
+ }
824
+
825
+ // call op for sincos calculation
826
+ hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
827
+ this ->ctx ,
828
+ this ->nat ,
829
+ rho_basis->npw ,
830
+ rho_basis->ig_gge0 ,
831
+ d_gcar,
832
+ d_tau,
833
+ d_it_facts,
834
+ d_aux,
835
+ d_force_g
836
+ );
837
+
838
+
839
+ if (this ->device == base_device::GpuDevice)
840
+ {
841
+
842
+ syncmem_var_d2h_op ()(this ->cpu_ctx , this ->ctx , force_g_host.data (), d_force_g, this ->nat * 3 );
843
+
844
+
845
+ delmem_var_op ()(this ->ctx , d_gcar);
846
+ delmem_var_op ()(this ->ctx , d_tau);
847
+ delmem_var_op ()(this ->ctx , d_it_facts);
848
+ delmem_complex_op ()(this ->ctx , d_aux);
849
+ delmem_var_op ()(this ->ctx , d_force_g);
850
+ }
851
+
852
+
853
+ for (int iat = 0 ; iat < this ->nat ; iat++) {
854
+ forceion (iat, 0 ) += static_cast <double >(force_g_host[iat * 3 + 0 ]);
855
+ forceion (iat, 1 ) += static_cast <double >(force_g_host[iat * 3 + 1 ]);
856
+ forceion (iat, 2 ) += static_cast <double >(force_g_host[iat * 3 + 2 ]);
857
+ }
858
+
859
+
860
+ // calculate real space force
668
861
#ifdef _OPENMP
669
862
#pragma omp parallel
670
863
{
@@ -688,66 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
688
881
iat_end = iat_beg + iat_end;
689
882
ucell.iat2iait (iat_beg, &ia_beg, &it_beg);
690
883
691
- int iat = iat_beg;
692
- int it = it_beg;
693
- int ia = ia_beg;
694
-
695
- // preprocess ig_gap for skipping the ig point
696
- int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw ) ? rho_basis->ig_gge0 : -1 ;
697
-
698
- double it_fact = 0 .;
699
- int last_it = -1 ;
700
-
701
- // iterating atoms
702
- while (iat < iat_end)
703
- {
704
- if (it != last_it)
705
- { // calculate it_tact when it is changed
706
- double zv;
707
- if (PARAM.inp .use_paw )
708
- {
709
- #ifdef USE_PAW
710
- zv = GlobalC::paw_cell.get_val (it);
711
- #endif
712
- }
713
- else
714
- {
715
- zv = ucell.atoms [it].ncpp .zv ;
716
- }
717
- it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
718
- last_it = it;
719
- }
720
-
721
- if (ucell.atoms [it].na != 0 )
722
- {
723
- const auto ig_loop = [&](int ig_beg, int ig_end) {
724
- for (int ig = ig_beg; ig < ig_end; ig++)
725
- {
726
- const ModuleBase::Vector3<double > gcar = rho_basis->gcar [ig];
727
- const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms [it].tau [ia]);
728
- double sinp, cosp;
729
- ModuleBase::libm::sincos (arg, &sinp, &cosp);
730
- double sumnb = -cosp * aux[ig].imag () + sinp * aux[ig].real ();
731
- forceion (iat, 0 ) += gcar[0 ] * sumnb;
732
- forceion (iat, 1 ) += gcar[1 ] * sumnb;
733
- forceion (iat, 2 ) += gcar[2 ] * sumnb;
734
- }
735
- };
736
-
737
- // skip ig_gge0 point by separating ig loop into two part
738
- ig_loop (0 , ig_gap);
739
- ig_loop (ig_gap + 1 , rho_basis->npw );
740
-
741
- forceion (iat, 0 ) *= it_fact;
742
- forceion (iat, 1 ) *= it_fact;
743
- forceion (iat, 2 ) *= it_fact;
744
-
745
- ++iat;
746
- ucell.step_iait (&ia, &it);
747
- }
748
- }
749
-
750
- // means that the processor contains G=0 term.
884
+
751
885
if (rho_basis->ig_gge0 >= 0 )
752
886
{
753
887
double rmax = 5.0 / (sqrt (alpha) * ucell.lat0 );
0 commit comments