added hparam to nextflow using -entry hyperopt

stracquadaniolab · Aug 22, 2024 · 65b0515 · 65b0515
1 parent 16c7581
commit 65b0515
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 98 deletions.
diff --git a/README.md b/README.md
@@ -127,6 +127,14 @@ docker pull ghcr.io/stracquadaniolab/gnn-suite:latest
     models = ["gcn", "gat", ..., "new_model"]
     ```
 
+## Running the Hyperparameter Optimization Workflow
+
+To run the hyperparameter optimization workflow using Nextflow:
+```bash
+nextflow run main.nf -profile docker,<experiment_file> -entry hyperopt
+```
+The results of the search are written to `results/hyperparameters/<dataSet>/`, one `best_trial_<model>_<dataSet>.txt` file per model.
+
 ## FAQ
 In case:
 ```groovy

diff --git a/bin/hyperopt.py b/bin/hyperopt.py
@@ -1,8 +1,9 @@
-# This script is used to run hyperparameter optimization for all models on all datasets
+#!/usr/bin/env python3
 import sys
 import torch
 import optuna
 sys.path.append('models.py')
+import typer
 
 from gnn import run
 
@@ -62,7 +63,9 @@ def run_optuna(data_pair, model):
     data_name = data_pair['name']
     model_name = model
 
-    num_epochs = 250
+    # TODO: num_epochs is temporarily lowered to 4 for quick testing; restore 250 for real runs
+    num_epochs = 4
+    #num_epochs = 250
 
     study = optuna.create_study(study_name=model_name+"_hp_search",
                                 direction="maximize")
@@ -94,104 +97,38 @@ def run_optuna(data_pair, model):
         print(f"    {key}: {value}")
 
     # Save the best trial to a file
-    save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'
+    # save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'
 
     # CHANGE FILENAME TO DIFFERENT MODELS
     # Save the best trial to a file
-    with open(f'{save_dir}/best_trial_{model_name}_{data_name}.txt', 'w') as f:
-        f.write("Best trial:\n")
-        f.write(f"  Value: {best_trial.value}\n")
-        f.write("  Params:\n")
-        for key, value in best_trial.params.items():
-            f.write(f"    {key}: {value}\n")
+    #with open(f'{save_dir}/best_trial_{model_name}_{data_name}.txt', 'w') as f:
+    #    f.write("Best trial:\n")
+    #    f.write(f"  Value: {best_trial.value}\n")
+    #    f.write("  Params:\n")
+    #    for key, value in best_trial.params.items():
+    #        f.write(f"    {key}: {value}\n")
 
 
-# Define the directory to save hyperparameter results
-save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'
-output_csv_file = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters.csv'
+def run_hyperopt(
+        gene_filename: str,
+        network_filename: str,
+        model_name: str,
+        data_set: str):
 
-def save_best_trial(study, model_name, data_name, save_dir=save_dir):
-    best_trial = study.best_trial
-
-    # Create the directory if it doesn't exist
-    os.makedirs(save_dir, exist_ok=True)
+    data_pairs = [{'name': data_set, 
+                   'networkFile': network_filename, 
+                   'geneFile': gene_filename}]
 
-    # Save the best trial to a file with the appropriate name
-    filename = f'best_trial_{model_name}_{data_name}.txt'
-    file_path = os.path.join(save_dir, filename)
-
-    with open(file_path, 'w') as f:
-        f.write("Best trial:\n")
-        f.write(f"  Value: {best_trial.value}\n")
-        f.write("  Params:\n")
-        for key, value in best_trial.params.items():
-            f.write(f"    {key}: {value}\n")
-
-    print(f"Best trial saved to {file_path}")
-
-def extract_model_dataset_names(filename):
-    parts = filename.split("_")
-    model_name = parts[2]
-    dataset_name = "_".join(parts[3:]).split(".")[0]
-    return model_name, dataset_name
-
-def handle_special_characters(value):
-    numeric_value = re.findall(r'[+-]?\d+(?:\.\d+)?', value)
-    if numeric_value:
-        return float(numeric_value[0])
-    else:
-        return value
-
-def summarize_results_to_csv(directory_path=save_dir, output_file=output_csv_file):
-    # Initialize an empty list to store the data
-    data = []
-
-    # Loop through the files in the directory
-    for filename in os.listdir(directory_path):
-        if filename.startswith("best_trial"):
-            model_name, dataset_name = extract_model_dataset_names(filename)
-            with open(os.path.join(directory_path, filename), 'r') as file:
-                lines = file.readlines()
-                # Extract the value using regular expressions to handle special characters
-                bacc_value = handle_special_characters(re.findall(r'[+-]?\d+(?:\.\d+)?', lines[1])[0])
-                params = {}
-                for line in lines[3:]:
-                    key, value = line.split(":")
-                    params[key.strip()] = handle_special_characters(value.strip())
-                data.append({
-                    'Model': model_name,
-                    'Dataset': dataset_name,
-                    'BACC': bacc_value,
-                    **params
-                })
-
-    # Create a DataFrame from the data list
-    df = pd.DataFrame(data)
-
-    # Save the data table to a CSV file
-    df.to_csv(output_file, index=False)
-
-    print("Data table has been saved to:", output_file)
-
-def main():
-    # Example setup - Replace with actual data and model handling logic
-    data_pairs = [{'name': 'example_dataset'}]  # This should be your actual data
-    models = ['example_model']  # Replace with your model names
+    models = [model_name]
 
     for data_pair in data_pairs:
-        data_name = data_pair['name']  # Get dataset name
-        for model_name in models:
-            print(f"Running hyperparameter optimization for model '{model_name}' with data pair '{data_name}'")
-
-            # Assuming run_optuna is your function to run the optimization
-            study = run_optuna(data_pair, model_name)  # Replace with actual function call
-            save_best_trial(study, model_name, data_name)
-
-            # Clear CUDA cache after each run to avoid memory issues
+        for model in models:
+            print(f"Running hyperparameter optimization for model '{model}' with data pair '{data_pair['name']}'")
+            run_optuna(data_pair, model)
             torch.cuda.empty_cache()
 
-    # After running all optimizations, summarize the results into a CSV
-    summarize_results_to_csv()
 
 if __name__ == "__main__":
-    main()
+    typer.run(run_hyperopt)
+
+
diff --git a/main.nf b/main.nf
@@ -19,6 +19,7 @@ println "Replicates: ${params.replicates}"
 println "Metrics:  ${params.metrics}"
 println "Eval-q: ${params.eval_threshold}"
 println "Verbose interval: ${params.verbose_interval}"
+println "Data set: ${params.dataSet}"
 
 println ""
 
@@ -111,25 +112,43 @@ process CollectStats {
 }
 
 process HyperparameterOptimization {
+
+
+    tag "${dataSet}-${model}"  
+
+    publishDir "${resultsDir}/hyperparameters/${dataSet}", pattern: "best_trial_${model}_${dataSet}.txt", mode: 'copy'
+
     input:
-    path config_file
-    path data_files
-
+        tuple path(geneFile),
+        path(networkFile),
+        val(model),
+        val(dataSet)
 
     output:
-    path "optuna_${params.dataSet}.json" into optuna_results
+        path "best_trial_${model}_${dataSet}.txt", emit: best_trial_output
 
-    script:
     """
-    python3 hyperopt_all_2.py --config ${config_file} --data ${data_files} --output optuna_${params.dataSet}.json
+        hyperopt.py ${geneFile} ${networkFile}\
+            ${model} \
+            ${dataSet} > best_trial_${model}_${dataSet}.txt
     """
 }
 
-workflow hyperoptWorkflow {
-    HyperparameterOptimization(config_file: params.config, data_files: params.data)
+workflow hyperopt {
+    geneChan = channel.fromPath(params.geneFile)
+    networkChan = channel.fromPath(params.networkFile)
+    modelChan = channel.from(params.models)
+    dataSetChan = channel.value(params.dataSet)
+    hparamChan = geneChan.combine(networkChan).combine(modelChan).combine(dataSetChan)
+
+    hparams = HyperparameterOptimization(
+        hparamChan
+    )
+    hparams.view()
 }
 
 
+
 workflow {
     // building channels for experiments
     geneChan = channel.fromPath(params.geneFile)