From c649385c6cb85494d73139ff9c463e98b0fc2b12 Mon Sep 17 00:00:00 2001
From: essharom <kamp.sebi@gmail.com>
Date: Thu, 22 Aug 2024 19:48:51 +0100
Subject: [PATCH] clean hparams, add readme

---
 README.md             | 11 ++++++++---
 bin/hyperopt.py       | 16 ++++++++++++++--
 bin/hyperopt_all_2.py | 40 ++++++++++++++++++++--------------------
 conf/string.config    |  2 +-
 main.nf               |  1 +
 5 files changed, 44 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index 1d587e7..6babf6f 100644
--- a/README.md
+++ b/README.md
@@ -127,13 +127,18 @@ docker pull ghcr.io/stracquadaniolab/gnn-suite:latest
     models = ["gcn", "gat", ..., "new_model"]
     ```
 
-## Running the Hyperparameter Optimization Workflow
+## Hyperparameter Optimization with Optuna
+
+To run the `optuna`-based hyperparameter search defined in `hyperopt.py`, launch the `hyperopt` entry point of the workflow with Nextflow:
 
-To run the hyperparameter optimization workflow using Nextflow:
 ```bash
 nextflow run main.nf -profile docker,<experiment_file> -entry hyperopt
 ```
-The results of the search will be in `results/hyperparameters`.
+
+The results of the search will be stored in the `results/hyperparameters/<experiment_file>/` directory. You can find the best trial information in the `best_trial_<model>_<experiment>.txt` file.
+
+For more information on `optuna`, you can visit the official documentation at [https://optuna.readthedocs.io](https://optuna.readthedocs.io).
+
 
 ## FAQ
 In case:
diff --git a/bin/hyperopt.py b/bin/hyperopt.py
index 4290c0d..ebea114 100755
--- a/bin/hyperopt.py
+++ b/bin/hyperopt.py
@@ -7,6 +7,15 @@
 
 from gnn import run
 
+import os
+import sys
+import contextlib
+
+def run_silently(func, *args, **kwargs):  # call func(*args, **kwargs) with Python-level stdout discarded
+    with open(os.devnull, 'w') as fnull:  # the null device acts as a throwaway sink; closed on exit
+        with contextlib.redirect_stdout(fnull):  # only sys.stdout is swapped; stderr is left untouched
+            return func(*args, **kwargs)  # func's return value (or exception) is passed straight through
+
 def objective_gnn(trial, model_name, gene_filename, network_filename, num_epochs=300):
     # Define hyperparameters to optimize
 
@@ -27,6 +36,9 @@ def objective_gnn(trial, model_name, gene_filename, network_filename, num_epochs
         verbose_interval= 10,
         dropout= dropout
     )
+
+    
+
     
     return bacc
 
@@ -64,8 +76,8 @@ def run_optuna(data_pair, model):
     model_name = model
 
     #testing num_epochs
-    num_epochs = 4
-    #num_epochs = 250
+    #num_epochs = 5
+    num_epochs = 250
     
     study = optuna.create_study(study_name=model_name+"_hp_search",
                                 direction="maximize")
diff --git a/bin/hyperopt_all_2.py b/bin/hyperopt_all_2.py
index 2ed13ee..a42bccf 100644
--- a/bin/hyperopt_all_2.py
+++ b/bin/hyperopt_all_2.py
@@ -62,7 +62,7 @@ def run_optuna(data_pair, model):
     data_name = data_pair['name']
     model_name = model
 
-    num_epochs = 250
+    num_epochs = 25
     # RENAME FOR DIFFERENT MODELS AND IN OBJECTIVE FUNCTION above
     #model_name = "GCN2"
 
@@ -85,7 +85,7 @@ def run_optuna(data_pair, model):
                                                     network_filename,
                                                     num_epochs),
                                                 n_jobs=-1,
-                                                n_trials=300)
+                                                n_trials=10)
 
 
     # Print the best trial
@@ -115,26 +115,26 @@ def run_optuna(data_pair, model):
         'name': 'string',
         'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_stringhc.tsv',
         'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nstringhc_lbailey.csv'
-    },
-    {
-        'name': 'string_cosmic',
-        'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_stringhc.tsv',
-        'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nstringhc_lcosmic.csv'
-    },
-    {
-        'name': 'biogrid',
-        'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_biogridhc.tsv',
-        'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nbiogridhc_lbailey.csv'
-    },
-    {
-        'name': 'biogrid_cosmic',
-        'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_biogridhc.tsv',
-        'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nbiogridhc_lcosmic.csv'
-    }
+    }#,
+    # {
+    #     'name': 'string_cosmic',
+    #     'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_stringhc.tsv',
+    #     'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nstringhc_lcosmic.csv'
+    # },
+    # {
+    #     'name': 'biogrid',
+    #     'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_biogridhc.tsv',
+    #     'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nbiogridhc_lbailey.csv'
+    # },
+    # {
+    #     'name': 'biogrid_cosmic',
+    #     'networkFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_biogridhc.tsv',
+    #     'geneFile': '/home/essharom/code/cancer-gnn-nf/data/entrez_fpancanall_nbiogridhc_lcosmic.csv'
+    # }
     ]
 
-    models = ["sage", "gin", "gtn", "gcn2", "gcn", "gat", "gat3h", "hgcn", "phgcn"]
-    
+    #models = ["sage", "gin", "gtn", "gcn2", "gcn", "gat", "gat3h", "hgcn", "phgcn"]
+    models = ["gcn"]
 
     for data_pair in data_pairs:
         for model in models:
diff --git a/conf/string.config b/conf/string.config
index 96da112..e7b196a 100644
--- a/conf/string.config
+++ b/conf/string.config
@@ -4,7 +4,7 @@ params {
   networkFile = "${baseDir}/data/entrez_stringhc.tsv"
   geneFile = "${baseDir}/data/entrez_fpancanall_nstringhc_lbailey.csv"
   epochs = [100]
-  models = ["gcn",]
+  models = ["gcn", "gat"]
   replicates = 3
   verbose_interval = 1
   dropout = 0.2
diff --git a/main.nf b/main.nf
index 882fed5..8936920 100644
--- a/main.nf
+++ b/main.nf
@@ -131,6 +131,7 @@ process HyperparameterOptimization {
         hyperopt.py ${geneFile} ${networkFile}\
             ${model} \
             ${dataSet} > best_trial_${model}_${dataSet}.txt
+        clean_hparams.py best_trial_${model}_${dataSet}.txt
     """
 }