1
- """
2
- Purpose:
3
- Select structures from orginal large reference dataset based on principal component Analysis (PCA) of
4
- descriptor space using farthest point sampling. We use the PbTe as a toy example to show how this script
5
- works, one need to modify the path of reference dataset, nep model, and selected frame number case by case.
6
-
7
- Ref:
8
- calorine: https://calorine.materialsmodeling.org/tutorials/visualize_descriptor_space_with_pca.html
9
- https://github.com/bigd4/PyNEP/blob/master/examples/plot_select_structure.py
10
-
11
- Author:
12
- Penghua Ying <hityingph(at)163.com>
13
- """
14
-
15
- from ase .io import read , write
16
- from pylab import *
17
- from calorine .nep import get_descriptors
18
- from sklearn .decomposition import PCA
19
- from tqdm import tqdm
20
- from scipy .spatial .distance import cdist
21
-
22
- # Farthest Point Sampling
23
- def farthest_point_sampling (points , n_samples ):
24
- n_points = points .shape [0 ]
25
- selected_indices = [np .random .randint (n_points )]
26
- for _ in range (1 , n_samples ):
27
- distances = cdist (points , points [selected_indices ])
28
- min_distances = np .min (distances , axis = 1 )
29
- next_index = np .argmax (min_distances )
30
- selected_indices .append (next_index )
31
- return selected_indices
32
-
33
- aw = 2
34
- fs = 16
35
- lw = 2
36
- font = {'size' : fs }
37
- matplotlib .rc ('font' , ** font )
38
- matplotlib .rc ('axes' , linewidth = aw )
39
-
40
- def set_fig_properties (ax_list ):
41
- tl = 8
42
- tw = 2
43
- tlm = 4
44
-
45
- for ax in ax_list :
46
- ax .tick_params (which = 'major' , length = tl , width = tw )
47
- ax .tick_params (which = 'minor' , length = tlm , width = tw )
48
- ax .tick_params (which = 'both' , axis = 'both' , direction = 'out' , right = False , top = False )
49
-
50
-
51
-
52
- tol = read ("../../examples/11_NEP_potential_PbTe/test.xyz" , ":" ) # read orginal larger reference.xyz
53
-
54
-
55
- descriptors = []
56
- for i , t in tqdm (enumerate (tol )):
57
- d = get_descriptors (t , model_filename = '../../examples/11_NEP_potential_PbTe/nep.txt' ) # get descriptors using the pre-trained nep model
58
- d_mean = np .mean (d , axis = 0 ) # Use average of each atomic descriptors to get structure descriptors
59
- descriptors .append (d_mean )
60
-
61
- descriptors = np .array (descriptors )
62
- print (f'Total frame of structures in dataset: { descriptors .shape [0 ]} ' )
63
- print (f'Number of descriptor components: { descriptors .shape [1 ]} ' )
64
- pca = PCA (n_components = 2 )
65
- pc = pca .fit_transform (descriptors )
66
- p0 = pca .explained_variance_ratio_ [0 ]
67
- p1 = pca .explained_variance_ratio_ [1 ]
68
- print (f'Explained variance for component 0: { p0 :.2f} ' )
69
- print (f'Explained variance for component 1: { p1 :.2f} ' )
70
-
71
- # Select 25 structures using FPS
72
- n_samples = 25
73
- selected_indices = farthest_point_sampling (pc , n_samples )
74
- selected_structures = [tol [i ] for i in selected_indices ]
75
- unselected_structures = [t for i , t in enumerate (tol ) if i not in selected_indices ]
76
-
77
- # Save the selected and unselected structures
78
- write ('selected_structures.xyz' , selected_structures )
79
-
80
- figure (figsize = (10 , 8 ))
81
- set_fig_properties ([gca ()])
82
- scatter (pc [:, 0 ], pc [:, 1 ], alpha = 0.5 , c = "C0" , label = 'All structures' )
83
- scatter (pc [selected_indices , 0 ], pc [selected_indices , 1 ], s = 8 , color = 'C1' , label = 'Selected structures' )
84
- xlabel ('PC1' )
85
- ylabel ('PC2' )
86
- legend ()
1
+ """
2
+ Purpose:
3
+ Select structures from the original large reference dataset based on principal component analysis (PCA) of
4
+ descriptor space using farthest point sampling. We use the PbTe as a toy example to show how this script
5
+ works; one needs to modify the paths of the reference dataset and NEP model, and the number of selected frames, case by case.
6
+
7
+ Ref:
8
+ calorine: https://calorine.materialsmodeling.org/tutorials/visualize_descriptor_space_with_pca.html
9
+ https://github.com/bigd4/PyNEP/blob/master/examples/plot_select_structure.py
10
+
11
+ Author:
12
+ Penghua Ying <hityingph(at)163.com>
13
+ """
14
+
15
+ from ase .io import read , write
16
+ from pylab import *
17
+ from calorine .nep import get_descriptors
18
+ from sklearn .decomposition import PCA
19
+ from tqdm import tqdm
20
+ from scipy .spatial .distance import cdist
21
+
22
+ # Farthest Point Sampling
23
def farthest_point_sampling(points, n_samples, seed=None):
    """Select mutually distant rows of ``points`` by farthest point sampling.

    Starting from one randomly chosen row, every step adds the row whose
    Euclidean distance to its nearest already-selected row is the largest.

    Args:
        points: 2-D array-like with one point per row.
        n_samples: Number of rows to select. Values larger than the number
            of rows are capped, so the result never contains duplicates.
        seed: Optional seed for choosing the starting row. When ``None``
            the global ``np.random`` state is used, as before.

    Returns:
        List of distinct integer row indices in selection order. The list is
        empty when ``n_samples <= 0`` or ``points`` has no rows.
    """
    points = np.asarray(points)
    n_points = points.shape[0]
    n_samples = min(int(n_samples), n_points)
    if n_samples <= 0:
        return []

    if seed is None:
        first_index = int(np.random.randint(n_points))
    else:
        first_index = int(np.random.default_rng(seed).integers(n_points))
    selected_indices = [first_index]

    # Distance from every point to its nearest selected point. It is updated
    # incrementally instead of recomputing against all selected points.
    min_distances = cdist(points, points[[first_index]])[:, 0]
    # Selected rows are masked so they cannot be picked again, even when
    # several rows coincide and all remaining distances are zero.
    min_distances[first_index] = -np.inf

    for _ in range(1, n_samples):
        next_index = int(np.argmax(min_distances))
        selected_indices.append(next_index)
        new_distances = cdist(points, points[[next_index]])[:, 0]
        min_distances = np.minimum(min_distances, new_distances)
        min_distances[next_index] = -np.inf
    return selected_indices
32
+
33
# Global matplotlib styling applied to every figure drawn by this script.
# aw: axes frame line width, fs: font size, lw: line width
# (lw is not referenced in the part of the script shown here).
aw, fs, lw = 2, 16, 2
font = dict(size=fs)
matplotlib.rc('font', **font)
matplotlib.rc('axes', linewidth=aw)
39
+
40
def set_fig_properties(ax_list, tl=8, tw=2, tlm=4):
    """Apply a uniform outward tick style to every axes in ``ax_list``.

    Args:
        ax_list: Iterable of objects exposing ``tick_params`` (e.g.
            matplotlib axes).
        tl: Length of the major ticks.
        tw: Width of both major and minor ticks.
        tlm: Length of the minor ticks.
    """
    for ax in ax_list:
        ax.tick_params(which='major', length=tl, width=tw)
        ax.tick_params(which='minor', length=tlm, width=tw)
        # Ticks point outward and are drawn on the left/bottom sides only.
        ax.tick_params(which='both', axis='both', direction='out', right=False, top=False)
49
+
50
+
51
+
52
# Input paths: adjust these for your own reference dataset and NEP model.
reference_file = "../../examples/11_NEP_potential_PbTe/test.xyz"
model_file = '../../examples/11_NEP_potential_PbTe/nep.txt'

tol = read(reference_file, ":")  # read every frame of the original, larger reference dataset

# Build one descriptor vector per structure by averaging its atomic descriptors.
descriptors = []
for t in tqdm(tol):  # wrapping the list itself lets tqdm show the total frame count
    d = get_descriptors(t, model_filename=model_file)  # descriptors from the pre-trained NEP model
    d_mean = np.mean(d, axis=0)  # average over atoms -> structure descriptor
    descriptors.append(d_mean)

descriptors = np.array(descriptors)
print(f'Total frame of structures in dataset: {descriptors.shape[0]}')
print(f'Number of descriptor components: {descriptors.shape[1]}')

# Project the structure descriptors onto their first two principal components.
pca = PCA(n_components=2)
pc = pca.fit_transform(descriptors)
p0 = pca.explained_variance_ratio_[0]
p1 = pca.explained_variance_ratio_[1]
print(f'Explained variance for component 0: {p0:.2f}')
print(f'Explained variance for component 1: {p1:.2f}')

# Select up to 25 structures using FPS in the 2-D PCA space. The count is
# capped at the dataset size so that no frame can be selected twice.
n_samples = min(25, len(tol))
selected_indices = farthest_point_sampling(pc, n_samples)
selected_structures = [tol[i] for i in selected_indices]
selected_lookup = set(selected_indices)  # O(1) membership tests below
unselected_structures = [t for i, t in enumerate(tol) if i not in selected_lookup]

# Save the selected structures (the unselected ones are only kept in memory).
write('selected_structures.xyz', selected_structures)

# Plot all structures in PCA space and highlight the selected ones.
figure(figsize=(10, 8))
set_fig_properties([gca()])
scatter(pc[:, 0], pc[:, 1], alpha=0.5, c="C0", label='All structures')
scatter(pc[selected_indices, 0], pc[selected_indices, 1], s=8, color='C1', label='Selected structures')
xlabel('PC1')
ylabel('PC2')
legend()
savefig('FPS.png', bbox_inches='tight')
0 commit comments