 #include "utilities/gpu_macro.cuh"
 #include "utilities/nep_utilities.cuh"
 #include <cstring>
+#include <iostream>
+#include <stdexcept>

 void Dataset::copy_structures(std::vector<Structure>& structures_input, int n1, int n2)
 {
@@ -32,6 +34,8 @@ void Dataset::copy_structures(std::vector<Structure>& structures_input, int n1,
     structures[n].num_atom = structures_input[n_input].num_atom;
     structures[n].weight = structures_input[n_input].weight;
     structures[n].has_virial = structures_input[n_input].has_virial;
+    structures[n].has_atomic_virial = structures_input[n_input].has_atomic_virial;
+    structures[n].atomic_virial_diag_only = structures_input[n_input].atomic_virial_diag_only;
     structures[n].charge = structures_input[n_input].charge;
     structures[n].energy = structures_input[n_input].energy;
     structures[n].energy_weight = structures_input[n_input].energy_weight;
@@ -68,6 +72,33 @@ void Dataset::copy_structures(std::vector<Structure>& structures_input, int n1,
       structures[n].fy[na] = structures_input[n_input].fy[na];
       structures[n].fz[na] = structures_input[n_input].fz[na];
     }
+
+    if (structures[n].has_atomic_virial != structures[0].has_atomic_virial) {
+      throw std::runtime_error("All structures must have the same has_atomic_virial flag.");
+    }
+    if (structures[n].atomic_virial_diag_only != structures[0].atomic_virial_diag_only) {
+      throw std::runtime_error("All structures must have the same atomic_virial_diag_only flag.");
+    }
+    if (structures[n].has_atomic_virial) {
+      structures[n].avirialxx.resize(structures[n].num_atom);
+      structures[n].avirialyy.resize(structures[n].num_atom);
+      structures[n].avirialzz.resize(structures[n].num_atom);
+      for (int na = 0; na < structures[n].num_atom; ++na) {
+        structures[n].avirialxx[na] = structures_input[n_input].avirialxx[na];
+        structures[n].avirialyy[na] = structures_input[n_input].avirialyy[na];
+        structures[n].avirialzz[na] = structures_input[n_input].avirialzz[na];
+      }
+      if (!structures[n].atomic_virial_diag_only) {
+        structures[n].avirialxy.resize(structures[n].num_atom);
+        structures[n].avirialyz.resize(structures[n].num_atom);
+        structures[n].avirialzx.resize(structures[n].num_atom);
+        for (int na = 0; na < structures[n].num_atom; ++na) {
+          structures[n].avirialxy[na] = structures_input[n_input].avirialxy[na];
+          structures[n].avirialyz[na] = structures_input[n_input].avirialyz[na];
+          structures[n].avirialzx[na] = structures_input[n_input].avirialzx[na];
+        }
+      }
+    }
   }
 }

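Note on the hunk above: the copy loop mirrors the new per-atom virial fields of `Structure`. As a reading aid only, here is a hypothetical sketch of the fields this code assumes; the actual definition lives in the project's structure header, and the element type is an assumption:

```cpp
// Hypothetical sketch of the Structure fields used above; not the real definition.
struct Structure {
  int num_atom;
  bool has_atomic_virial;        // per-atom reference virials were provided
  bool atomic_virial_diag_only;  // only xx/yy/zz components were provided
  std::vector<float> avirialxx, avirialyy, avirialzz;  // diagonal components
  std::vector<float> avirialxy, avirialyz, avirialzx;  // off-diagonal components
  // ... existing fields: energy, fx/fy/fz, virial, weight, ...
};
```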
@@ -142,6 +173,9 @@ void Dataset::initialize_gpu_data(Parameters& para)
   energy_weight_cpu.resize(Nc);
   virial_ref_cpu.resize(Nc * 6);
   force_ref_cpu.resize(N * 3);
+  if (structures[0].has_atomic_virial) {
+    avirial_ref_cpu.resize(N * (structures[0].atomic_virial_diag_only ? 3 : 6));
+  }
   temperature_ref_cpu.resize(N);

   for (int n = 0; n < Nc; ++n) {
@@ -170,6 +204,16 @@ void Dataset::initialize_gpu_data(Parameters& para)
       force_ref_cpu[Na_sum_cpu[n] + na + N] = structures[n].fy[na];
       force_ref_cpu[Na_sum_cpu[n] + na + N * 2] = structures[n].fz[na];
       temperature_ref_cpu[Na_sum_cpu[n] + na] = structures[n].temperature;
+      if (structures[n].has_atomic_virial) {
+        avirial_ref_cpu[Na_sum_cpu[n] + na] = structures[n].avirialxx[na];
+        avirial_ref_cpu[Na_sum_cpu[n] + na + N] = structures[n].avirialyy[na];
+        avirial_ref_cpu[Na_sum_cpu[n] + na + N * 2] = structures[n].avirialzz[na];
+        if (!structures[n].atomic_virial_diag_only) {
+          avirial_ref_cpu[Na_sum_cpu[n] + na + N * 3] = structures[n].avirialxy[na];
+          avirial_ref_cpu[Na_sum_cpu[n] + na + N * 4] = structures[n].avirialyz[na];
+          avirial_ref_cpu[Na_sum_cpu[n] + na + N * 5] = structures[n].avirialzx[na];
+        }
+      }
     }
   }

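The indexing above lays the reference buffer out component-major with stride `N`, mirroring `force_ref_cpu`. A small illustration (the helper name is hypothetical, not part of the PR):

```cpp
// Component-major layout, stride N (matches the indexing in the hunk above):
//   [ xx_0 .. xx_{N-1} | yy_0 .. yy_{N-1} | zz_0 .. zz_{N-1} | xy .. | yz .. | zx .. ]
// Global atom index i = Na_sum_cpu[n] + na; component c is 0..2 (diag-only) or 0..5.
inline int avirial_ref_index(int c, int i, int N) { return c * N + i; }
```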
@@ -179,13 +223,19 @@ void Dataset::initialize_gpu_data(Parameters& para)
   energy_weight_gpu.resize(Nc);
   virial_ref_gpu.resize(Nc * 6);
   force_ref_gpu.resize(N * 3);
+  if (structures[0].has_atomic_virial) {
+    avirial_ref_gpu.resize(N * (structures[0].atomic_virial_diag_only ? 3 : 6));
+  }
   temperature_ref_gpu.resize(N);
   type_weight_gpu.copy_from_host(para.type_weight_cpu.data());
   charge_ref_gpu.copy_from_host(charge_ref_cpu.data());
   energy_ref_gpu.copy_from_host(energy_ref_cpu.data());
   energy_weight_gpu.copy_from_host(energy_weight_cpu.data());
   virial_ref_gpu.copy_from_host(virial_ref_cpu.data());
   force_ref_gpu.copy_from_host(force_ref_cpu.data());
+  if (structures[0].has_atomic_virial) {
+    avirial_ref_gpu.copy_from_host(avirial_ref_cpu.data());
+  }
   temperature_ref_gpu.copy_from_host(temperature_ref_cpu.data());

   box.resize(Nc * 18);
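For readers unfamiliar with GPUMD's `GPU_Vector` wrapper, the `resize`/`copy_from_host` pair above corresponds roughly to the raw CUDA below. This is a sketch under that assumption, not the wrapper's actual implementation:

```cpp
// Rough raw-CUDA equivalent of the GPU_Vector calls above (illustrative only).
float* d_avirial_ref = nullptr;
const bool diag_only = structures[0].atomic_virial_diag_only;
const size_t num_elements = static_cast<size_t>(N) * (diag_only ? 3 : 6);
cudaMalloc(&d_avirial_ref, num_elements * sizeof(float));  // ~ resize(...)
cudaMemcpy(d_avirial_ref, avirial_ref_cpu.data(),          // ~ copy_from_host(...)
           num_elements * sizeof(float), cudaMemcpyHostToDevice);
```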
@@ -447,6 +497,157 @@ std::vector<float> Dataset::get_rmse_force(Parameters& para, const bool use_weig
   return rmse_array;
 }

+static __global__ void gpu_sum_avirial_diag_only_error(
+  const int N,
+  int* g_Na,
+  int* g_Na_sum,
+  int* g_type,
+  float* g_type_weight,
+  float* g_virial,
+  float* g_avxx_ref,
+  float* g_avyy_ref,
+  float* g_avzz_ref,
+  float* error_gpu)
+{
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  int N1 = g_Na_sum[bid];
+  int N2 = N1 + g_Na[bid];
+  extern __shared__ float s_error[];
+  s_error[tid] = 0.0f;
+
+  for (int n = N1 + tid; n < N2; n += blockDim.x) {
+    float avxx_ref = g_avxx_ref[n];
+    float avyy_ref = g_avyy_ref[n];
+    float avzz_ref = g_avzz_ref[n];
+    float dxx = g_virial[n] - avxx_ref;
+    float dyy = g_virial[1 * N + n] - avyy_ref;
+    float dzz = g_virial[2 * N + n] - avzz_ref;
+    float diff_square = dxx * dxx + dyy * dyy + dzz * dzz;
+    s_error[tid] += diff_square;
+  }
+  __syncthreads();
+
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      s_error[tid] += s_error[tid + offset];
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    error_gpu[bid] = s_error[0];
+  }
+}
+
+static __global__ void gpu_sum_avirial_error(
+  const int N,
+  int* g_Na,
+  int* g_Na_sum,
+  int* g_type,
+  float* g_type_weight,
+  float* g_virial,
+  float* g_avxx_ref,
+  float* g_avyy_ref,
+  float* g_avzz_ref,
+  float* g_avxy_ref,
+  float* g_avyz_ref,
+  float* g_avzx_ref,
+  float* error_gpu)
+{
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  int N1 = g_Na_sum[bid];
+  int N2 = N1 + g_Na[bid];
+  extern __shared__ float s_error[];
+  s_error[tid] = 0.0f;
+
+  for (int n = N1 + tid; n < N2; n += blockDim.x) {
+    float avxx_ref = g_avxx_ref[n];
+    float avyy_ref = g_avyy_ref[n];
+    float avzz_ref = g_avzz_ref[n];
+    float avxy_ref = g_avxy_ref[n];
+    float avyz_ref = g_avyz_ref[n];
+    float avzx_ref = g_avzx_ref[n];
+    float dxx = g_virial[n] - avxx_ref;
+    float dyy = g_virial[1 * N + n] - avyy_ref;
+    float dzz = g_virial[2 * N + n] - avzz_ref;
+    float dxy = g_virial[3 * N + n] - avxy_ref;
+    float dyz = g_virial[4 * N + n] - avyz_ref;
+    float dzx = g_virial[5 * N + n] - avzx_ref;
+    float diff_square = dxx * dxx + dyy * dyy + dzz * dzz + dxy * dxy + dyz * dyz + dzx * dzx;
+    s_error[tid] += diff_square;
+  }
+  __syncthreads();
+
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      s_error[tid] += s_error[tid + offset];
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    error_gpu[bid] = s_error[0];
+  }
+}
+
+std::vector<float> Dataset::get_rmse_avirial(Parameters& para, const bool use_weight, int device_id)
+{
+  CHECK(gpuSetDevice(device_id));
+  const int block_size = 256;
+
+  if (structures[0].atomic_virial_diag_only) {
+    gpu_sum_avirial_diag_only_error<<<Nc, block_size, sizeof(float) * block_size>>>(
+      N,
+      Na.data(),
+      Na_sum.data(),
+      type.data(),
+      type_weight_gpu.data(),
+      virial.data(),
+      avirial_ref_gpu.data(),
+      avirial_ref_gpu.data() + N,
+      avirial_ref_gpu.data() + N * 2,
+      error_gpu.data());
+  } else {
+    gpu_sum_avirial_error<<<Nc, block_size, sizeof(float) * block_size>>>(
+      N,
+      Na.data(),
+      Na_sum.data(),
+      type.data(),
+      type_weight_gpu.data(),
+      virial.data(),
+      avirial_ref_gpu.data(),
+      avirial_ref_gpu.data() + N,
+      avirial_ref_gpu.data() + N * 2,
+      avirial_ref_gpu.data() + N * 3,
+      avirial_ref_gpu.data() + N * 4,
+      avirial_ref_gpu.data() + N * 5,
+      error_gpu.data());
+  }
+  int mem = sizeof(float) * Nc;
+  CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost));
+
+  std::vector<float> rmse_array(para.num_types + 1, 0.0f);
+  std::vector<int> count_array(para.num_types + 1, 0);
+  for (int n = 0; n < Nc; ++n) {
+    float rmse_temp = use_weight ? weight_cpu[n] * weight_cpu[n] * error_cpu[n] : error_cpu[n];
+    for (int t = 0; t < para.num_types + 1; ++t) {
+      if (has_type[t * Nc + n]) {
+        rmse_array[t] += rmse_temp;
+        count_array[t] += Na_cpu[n];
+      }
+    }
+  }
+
+  for (int t = 0; t <= para.num_types; ++t) {
+    if (count_array[t] > 0) {
+      // normalize by the number of components compared per atom: 3 in diag-only mode, 6 otherwise
+      rmse_array[t] = sqrt(rmse_array[t] / (count_array[t] * (structures[0].atomic_virial_diag_only ? 3 : 6)));
+    }
+  }
+  return rmse_array;
+}
+
 static __global__ void
 gpu_get_energy_shift(
   int* g_Na,
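Both new kernels accumulate per-structure squared errors with a block-strided loop followed by the classic shared-memory tree reduction, which requires `blockDim.x` to be a power of two (the hard-coded `block_size = 256` satisfies this). A minimal standalone sketch of that pattern, with hypothetical names:

```cuda
// Standalone sketch of the per-structure block-sum pattern used above.
// One block per structure; launch with dynamic shared memory of
// sizeof(float) * blockDim.x, e.g. block_sum<<<Nc, 256, 256 * sizeof(float)>>>(...).
static __global__ void block_sum(
  const float* g_in, const int* g_start, const int* g_count, float* g_out)
{
  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
  const int n1 = g_start[bid];       // first element owned by this block
  const int n2 = n1 + g_count[bid];  // one past the last element
  extern __shared__ float s_sum[];
  s_sum[tid] = 0.0f;
  for (int n = n1 + tid; n < n2; n += blockDim.x) {
    s_sum[tid] += g_in[n];  // strided accumulation into shared memory
  }
  __syncthreads();
  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
    if (tid < offset) {
      s_sum[tid] += s_sum[tid + offset];  // tree reduction: halve active threads each step
    }
    __syncthreads();
  }
  if (tid == 0) {
    g_out[bid] = s_sum[0];  // one partial sum per block/structure
  }
}
```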
@@ -625,6 +826,9 @@ static __global__ void gpu_sum_virial_error(

 std::vector<float> Dataset::get_rmse_virial(Parameters& para, const bool use_weight, int device_id)
 {
+  if (para.atomic_v) {
+    return get_rmse_avirial(para, use_weight, device_id);
+  }
   CHECK(gpuSetDevice(device_id));

   std::vector<float> rmse_array(para.num_types + 1, 0.0f);
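With this dispatch in place, callers keep using `get_rmse_virial` unchanged; per-atom virial RMSE is returned whenever atomic virials are enabled. A hypothetical call-site sketch (the variable names and the meaning of the extra entry are assumptions, not taken from the PR):

```cpp
// Hypothetical call site; para.atomic_v is assumed to be set when the
// training set carries per-atom reference virials.
std::vector<float> rmse = dataset.get_rmse_virial(para, /*use_weight=*/true, /*device_id=*/0);
// rmse has para.num_types + 1 entries: one per atom type plus an aggregate
// bucket (which index is the aggregate is an assumption here).
```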