Release V0.1.1

deepomicslab · Apr 26, 2023 · 85ca668 · 85ca668
1 parent 286958b
commit 85ca668
Show file tree

Hide file tree

Showing 27 changed files with 14,989 additions and 95 deletions.
diff --git a/README.md b/README.md
@@ -61,25 +61,61 @@ predict_data = sc_model.fit_list_complete(data)
 
 ## Parameters
 ###  ```sc_multi_omics```
-> + ```K1```: The local element-wise product parameter, see the manuscript for details (default=20).
-> + ```K2```: The local element-wise product parameter (default=20).
-> + ```K3```: The local element-wise product parameter (default=20).
-> + ```random_seed```: The random seed used in optimization (default=111).
+> + ```K1```: The local element-wise product parameter, see the manuscript for details (default=30).
+> + ```K2```: The local element-wise product parameter (default=30).
+> + ```K3```: The local element-wise product parameter (default=30).
+> + ```random_seed```: The random seed used in optimization (default=123).
 
 ###  ```fit```
+> + ```normalization```: Whether to apply min-max normalization (default=True).
+> + ```pre_impute```: Whether to apply KNNImputer for pre-processing (default=False).
 > + ```opt```: The optimization algorithm for gradient descent, including SGD, Adam, Adadelta, Adagrad, AdamW, SparseAdam, Adamax, ASGD, LBFGS (default="Adam").
 > + ```dist```: The distribution used for modeling, including gaussian, poisson, negative_bionomial (default="gaussian").
 > + ```lr```: The learning rate for gradient descent (default=1e-2).
 > + ```n_epochs```: The number of optimization epochs (default=1000).
-> + ```lambda_C_regularizer```: The coefficient for the penalty term of global cell embeddings (default=0.01).
-> + ```lambda_G_regularizer```: The coefficient for the penalty term of global gene embeddings (default=0.01).
-> + ```lambda_O_regularizer```: The coefficient list for the penalty term of global omics embeddings; the length of the list should be the same with the number of omics (default=[0.01, 0.01]).
-> + ```lambda_OC_regularizer```: The coefficient list for the penalty term of omics-specific cell embeddings; the length of the list should be the same with the number of omics, not avaiable for complete functions (default=[1, 1]).
-> + ```lambda_OG_regularizer```: The coefficient list for the penalty term of omics-specific gene embeddings, the length of the list should be the same with the number of omics, not avaiable for list functions (default=[1, 1]).
-> + ```batch_size```: The batch size used for gradient descent, not avaiable for complete functions (default=1000).
+> + ```lambda_C_regularizer```: The coefficient for the penalty term of global cell embeddings (default=0, which means the coefficient is adjusted automatically).
+> + ```lambda_G_regularizer```: The coefficient for the penalty term of global gene embeddings (default=0).
+> + ```lambda_O_regularizer```: The coefficient list for the penalty term of global omics embeddings; the length of the list should be the same as the number of omics (default=[0, 0]).
+> + ```lambda_OC_regularizer```: The coefficient list for the penalty term of omics-specific cell embeddings; the length of the list should be the same as the number of omics; not available for complete functions (default=[0, 0]).
+> + ```lambda_OG_regularizer```: The coefficient list for the penalty term of omics-specific gene embeddings; the length of the list should be the same as the number of omics; not available for list functions (default=[0, 0]).
+> + ```batch_size```: The batch size used for gradient descent; not available for complete functions (default=256).
 > + ```device```: CPU or GPU (default='cuda' if torch.cuda.is_available() else 'cpu').
 > + ```verbose```: Whether to print loss for each epoch (default=True).
 
+###  ```cell_analysis```
+#### ```knn_adj_matrix```
+Construct KNN graph with the cell embeddings.
+> + ```k```: The number of neighbors used to construct KNN graph (default=20).
+#### ```snn_adj_matrix```
+Construct SNN graph with the cell embeddings.
+> + ```k```: The number of neighbors used to construct SNN graph (default=20).
+#### ```jsnn_adj_matrix```
+Construct jSNN graph with the cell embeddings.
+> + ```k```: The number of neighbors used to construct Jaccard SNN graph (default=20).
+> + ```prune```: Set scores below this value to zero (default=1/15).
+#### ```RunLouvain```
+Run Louvain algorithm for the graph.
+> + ```k```: Terminate the search once this number of communities is detected (default=None).
+#### ```RunSpectral```
+Run Spectral clustering algorithm for the graph.
+> + ```k```: Number of clusters (default=5).
+#### ```RunLeiden```
+Run Leiden algorithm for the graph.
+
+###  ```gene_analysis```
+#### ```pearson_correlation```
+Calculate the correlation between the features.
+#### ```feature_projection```
+Project the feature embedding to cell embeddings and visualize with UMAP.
+> + ```umap_epochs```: The number of UMAP epochs for visualization (default=100).
+> + ```dimension```: The dimension of the embeddings to use (default=30).
+> + ```figure_name```: The saved figure name (default="feature_projections.png").
+
+
+### Version history
++ `v0.1.1`: Automatically adjust the coefficients; add downstream analyses; extend to unpaired data.
++ `v0.0.1`: Initial version.
+
 ### Maintainer
 WANG Ruohan [email protected]
 

diff --git a/examples/.DS_Store b/examples/.DS_Store
diff --git a/examples/CITE_seq_analysis.py → examples/case_studies/CITE_seq_analysis.py b/examples/CITE_seq_analysis.py → examples/case_studies/CITE_seq_analysis.py
@@ -33,30 +33,16 @@ def load_data():
     return expression_data, protein_data, labels
 
 
-def data_normalization(expression_data, protein_data):
-
-    # min-max normalization
-    expression_data = (expression_data - expression_data.min()) / (expression_data.max() - expression_data.min()) + 0.1
-    protein_data = (protein_data - protein_data.min()) / (protein_data.max() - protein_data.min()) + 0.1
-
-    expression_data = np.array(expression_data)
-    protein_data = np.array(protein_data)
-
-    data = [expression_data, protein_data]
-
-    return data
-
-
 if __name__ == "__main__":
 
     start_time = time.time()
     expression_data, protein_data, labels = load_data()
-    data = data_normalization(expression_data, protein_data)
+    data = [expression_data, protein_data]
     print(data[0].shape)
     print(data[1].shape)
 
-    sc_model = sc_multi_omics(K1=30, K2=30, K3=30)
-    predict_data = sc_model.fit_list_complete(data, opt="Adam", dist="gaussian", lr=1e-3, n_epochs=1500, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01], device="cuda:1")
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_list_complete(data, dist="gaussian", lr=1e-3, n_epochs=5000)
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G[0], delimiter = ',')

diff --git a/examples/PEA_STA_analysis.py → examples/case_studies/PEA_STA_analysis.py b/examples/PEA_STA_analysis.py → examples/case_studies/PEA_STA_analysis.py
@@ -22,29 +22,14 @@ def load_data():
     return expression_data, protein_data, labels
 
 
-def data_normalization(expression_data, protein_data):
-
-    # min-max normalization
-    expression_data = (expression_data - expression_data.min()) / (expression_data.max() - expression_data.min()) + 0.1
-    protein_data = (protein_data - protein_data.min()) / (protein_data.max() - protein_data.min()) + 0.1
-
-    expression_data = np.array(expression_data)
-    protein_data = np.array(protein_data)
-
-    data = np.array([expression_data, protein_data])
-
-    return data
-
-
 if __name__ == "__main__":
 
     start_time = time.time()
     expression_data, protein_data, labels = load_data()
-    data = data_normalization(expression_data, protein_data)
-    print(data.shape)
+    data = np.array([expression_data, protein_data])
 
-    sc_model = sc_multi_omics(K1=10, K2=10, K3=10)
-    predict_data = sc_model.fit(data, opt="Adam", dist="gaussian", n_epochs=850, lambda_C_regularizer=0.01, lambda_G_regularizer=1, lambda_O_regularizer=[1, 1], lambda_OC_regularizer=[1, 1], lambda_OG_regularizer=[0.01, 0.01], batch_size=256, device="cpu")
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit(data, dist="gaussian", n_epochs=1000)
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("local_gene_embeddings.csv", sc_model.OG, delimiter = ',')

diff --git a/examples/SCoPE2_analysis.py → examples/case_studies/SCoPE2_analysis.py b/examples/SCoPE2_analysis.py → examples/case_studies/SCoPE2_analysis.py
@@ -17,29 +17,15 @@ def load_data():
     return expression_data, protein_data, labels
 
 
-def data_normalization(expression_data, protein_data):
-
-    # min-max normalization
-    expression_data = (expression_data - expression_data.min()) / (expression_data.max() - expression_data.min()) + 0.1
-    protein_data = (protein_data - protein_data.min()) / (protein_data.max() - protein_data.min()) + 0.1
-
-    expression_data = np.array(expression_data)
-    protein_data = np.array(protein_data)
-    data = np.array([expression_data, protein_data])
-    data[np.isnan(data)] = 0.1
-
-    return data
-
-
 if __name__ == "__main__":
 
     start_time = time.time()
     expression_data, protein_data, labels = load_data()
-    data = data_normalization(expression_data, protein_data)
+    data =np.array([expression_data, protein_data])
     print(data.shape)
 
-    sc_model = sc_multi_omics(K1=30, K2=30, K3=30)
-    predict_data = sc_model.fit_complete(data, opt="Adam", dist="gaussian", lr=1e-3, n_epochs=7000, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01], lambda_OC_regularizer=[2, 2], lambda_OG_regularizer=[2, 2]) 
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_complete(data, dist="gaussian", lr=1e-3, n_epochs=5000) 
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G, delimiter = ',')

diff --git a/examples/SNARE_seq_adult_mouse_analysis.py → ...studies/SNARE_seq_adult_mouse_analysis.py b/examples/SNARE_seq_adult_mouse_analysis.py → ...studies/SNARE_seq_adult_mouse_analysis.py
@@ -23,8 +23,8 @@ def load_data():
     print(data[0].shape)
     print(data[1].shape)
 
-    sc_model = sc_multi_omics(K1=20, K2=20, K3=20)
-    predict_data = sc_model.fit_list_complete(data, opt="Adam", dist="gaussian", lr=1e-3, n_epochs=600, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01])
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_list_complete(data, normalization=False, dist="gaussian", lr=1e-3, n_epochs=3000)
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G[0], delimiter = ',')

diff --git a/...ples/SNARE_seq_neonatal_mouse_analysis.py → ...dies/SNARE_seq_neonatal_mouse_analysis.py b/...ples/SNARE_seq_neonatal_mouse_analysis.py → ...dies/SNARE_seq_neonatal_mouse_analysis.py
@@ -23,8 +23,8 @@ def load_data():
     print(data[0].shape)
     print(data[1].shape)
 
-    sc_model = sc_multi_omics(K1=20, K2=20, K3=20)
-    predict_data = sc_model.fit_list_complete(data, opt="Adam", dist="gaussian", lr=1e-3, n_epochs=1700, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01])
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_list_complete(data, normalization=False, dist="gaussian", lr=1e-3, n_epochs=3000)
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G[0], delimiter = ',')

diff --git a/examples/case_studies/scNMT_analysis.py b/examples/case_studies/scNMT_analysis.py
@@ -0,0 +1,43 @@
+import numpy as np
+import pandas as pd
+from scoit import sc_multi_omics
+import time
+
+def load_data():
+    expression_data = np.loadtxt("data/scNMT/expression_data_300.csv")
+    promoter_methy_data = np.loadtxt("data/scNMT/promoter_methy_data_300.csv")
+    promoter_acc_data = np.loadtxt("data/scNMT/promoter_acc_data_300.csv")
+
+    cell_stage = np.array(pd.read_csv("data/scNMT/cell_stage.csv", header=None))
+
+    labels = []
+    for each in cell_stage:
+        if each == "E5.5":
+            labels.append(0)
+        if each == "E6.5":
+            labels.append(1)
+        if each == "E7.5":
+            labels.append(2)
+    labels = np.array(labels)
+
+
+    return expression_data, promoter_methy_data, promoter_acc_data, labels
+
+
+if __name__ == "__main__":
+
+    start_time = time.time()
+    expression_data, promoter_methy_data, promoter_acc_data, labels = load_data()
+    data = [expression_data, promoter_methy_data, promoter_acc_data]
+    print(data[0].shape)
+    print(data[1].shape)
+    print(data[2].shape)
+
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_list(data, normalization=False, dist="gaussian", lr=1e-3, n_epochs=1000)
+
+    np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
+    np.savetxt("predict_data_expression.csv", predict_data[0])
+    np.savetxt("predict_data_promoter_methy.csv", predict_data[1])
+    np.savetxt("predict_data_promoter_acc.csv", predict_data[2])
+    print(time.time() - start_time)
diff --git a/examples/sc_GEM_analysis.py → examples/case_studies/sc_GEM_analysis.py b/examples/sc_GEM_analysis.py → examples/case_studies/sc_GEM_analysis.py
@@ -4,8 +4,8 @@
 import time
 
 def load_data():
-    expression_data = pd.read_csv("data/sc_GEM/expression_data.csv", index_col=0)
-    methylation_data = pd.read_csv("data/sc_GEM/methylation_data.csv", index_col=0)
+    expression_data = np.array(pd.read_csv("data/sc_GEM/expression_data.csv", index_col=0))
+    methylation_data = np.array(pd.read_csv("data/sc_GEM/methylation_data.csv", index_col=0))
     cell_stage = np.array(pd.read_csv("data/sc_GEM/cell_stage.csv", header=None))[0]
     labels = []
     for each in cell_stage:
@@ -25,28 +25,16 @@ def load_data():
     return expression_data, methylation_data, labels
 
 
-def data_normalization(expression_data, methylation_data):
-
-    # min-max normalization
-    expression_data = (expression_data - expression_data.min()) / (expression_data.max() - expression_data.min()) + 0.1
-    methylation_data = methylation_data + 0.1
-
-    expression_data = np.array(expression_data)
-    methylation_data = np.array(methylation_data)
-
-    data = np.array([expression_data, methylation_data])
-
-    return data
 
 if __name__ == "__main__":
 
     start_time = time.time()
     expression_data, methylation_data, labels = load_data()
-    data = data_normalization(expression_data, methylation_data)
+    data = np.array([expression_data, methylation_data])
     print(data.shape)
 
-    sc_model = sc_multi_omics(K1=30, K2=30, K3=30)
-    predict_data = sc_model.fit(data, opt="Adam", dist="negative_bionomial", n_epochs=260, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01], lambda_OC_regularizer=[1, 1], lambda_OG_regularizer=[1, 1], batch_size=256, device="cpu")
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit(data, dist="negative_bionomial", n_epochs=1000, device="cpu")
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G, delimiter = ',')

diff --git a/examples/sci_CAR_analysis.py → examples/case_studies/sci_CAR_analysis.py b/examples/sci_CAR_analysis.py → examples/case_studies/sci_CAR_analysis.py
@@ -23,8 +23,8 @@ def load_data():
     print(data[0].shape)
     print(data[1].shape)
 
-    sc_model = sc_multi_omics(K1=20, K2=20, K3=20)
-    predict_data = sc_model.fit_list_complete(data, opt="Adam", dist="gaussian", lr=1e-3, n_epochs=500, lambda_C_regularizer=0.01, lambda_G_regularizer=0.01, lambda_O_regularizer=[0.01, 0.01])
+    sc_model = sc_multi_omics()
+    predict_data = sc_model.fit_list_complete(data, normalization=False, dist="gaussian", lr=1e-3, n_epochs=3000)
 
     np.savetxt("cell_embeddings.csv", sc_model.C, delimiter = ',')
     np.savetxt("gene_embeddings.csv", sc_model.G[0], delimiter = ',')