Skip to content

Commit

Permalink
added hparam to nextflow using -entry hyperopt
Browse files Browse the repository at this point in the history
  • Loading branch information
essharom committed Aug 22, 2024
1 parent 16c7581 commit 65b0515
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 98 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ docker pull ghcr.io/stracquadaniolab/gnn-suite:latest
models = ["gcn", "gat", ..., "new_model"]
```
## Running the Hyperparameter Optimization Workflow
To run the hyperparameter optimization workflow using Nextflow:
```bash
nextflow run main.nf -profile docker,<experiment_file> -entry hyperopt
```
The best trial found for each model is written to `results/hyperparameters/<data set>/best_trial_<model>_<data set>.txt`.

## FAQ
In case:
```groovy
Expand Down
117 changes: 27 additions & 90 deletions bin/hyperopt.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# This script is used to run hyperparameter optimization for all models on all datasets
#!/usr/bin/env python3
import sys
import torch
import optuna
sys.path.append('models.py')
import typer

from gnn import run

Expand Down Expand Up @@ -62,7 +63,9 @@ def run_optuna(data_pair, model):
data_name = data_pair['name']
model_name = model

num_epochs = 250
#testing num_epochs
num_epochs = 4
#num_epochs = 250

study = optuna.create_study(study_name=model_name+"_hp_search",
direction="maximize")
Expand Down Expand Up @@ -94,104 +97,38 @@ def run_optuna(data_pair, model):
print(f" {key}: {value}")

# Save the best trial to a file
save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'
# save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'

# CHANGE FILENAME TO DIFFERENT MODELS
# Save the best trial to a file
with open(f'{save_dir}/best_trial_{model_name}_{data_name}.txt', 'w') as f:
f.write("Best trial:\n")
f.write(f" Value: {best_trial.value}\n")
f.write(" Params:\n")
for key, value in best_trial.params.items():
f.write(f" {key}: {value}\n")
#with open(f'{save_dir}/best_trial_{model_name}_{data_name}.txt', 'w') as f:
# f.write("Best trial:\n")
# f.write(f" Value: {best_trial.value}\n")
# f.write(" Params:\n")
# for key, value in best_trial.params.items():
# f.write(f" {key}: {value}\n")


# Define the directory to save hyperparameter results
save_dir = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters'
output_csv_file = '/home/essharom/code/cancer-gnn-nf/results/hyperparameters.csv'
def run_hyperopt(
gene_filename: str,
network_filename: str,
model_name: str,
data_set: str):

def save_best_trial(study, model_name, data_name, save_dir=save_dir):
best_trial = study.best_trial

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)
data_pairs = [{'name': data_set,
'networkFile': network_filename,
'geneFile': gene_filename}]

# Save the best trial to a file with the appropriate name
filename = f'best_trial_{model_name}_{data_name}.txt'
file_path = os.path.join(save_dir, filename)

with open(file_path, 'w') as f:
f.write("Best trial:\n")
f.write(f" Value: {best_trial.value}\n")
f.write(" Params:\n")
for key, value in best_trial.params.items():
f.write(f" {key}: {value}\n")

print(f"Best trial saved to {file_path}")

def extract_model_dataset_names(filename):
    """Recover the model and dataset names from a best-trial file name.

    The expected layout is ``best_trial_<model>_<dataset>.<ext>``: the name is
    split on underscores, the third field is taken as the model and all the
    remaining fields (re-joined with underscores) form the dataset.

    Args:
        filename: Base name of the result file, e.g.
            ``best_trial_gcn_string.txt``.

    Returns:
        A ``(model_name, dataset_name)`` tuple of strings.

    Raises:
        IndexError: If the name has fewer than three underscore-separated
            fields.
    """
    parts = filename.split("_")
    model_name = parts[2]
    # Strip only the trailing extension: cutting at the *first* dot would
    # truncate dataset names that contain dots themselves (e.g. "v11.5").
    # NOTE(review): a model name that itself contains "_" cannot be told apart
    # from the dataset part with this naming scheme.
    dataset_name = "_".join(parts[3:]).rsplit(".", 1)[0]
    return model_name, dataset_name

def handle_special_characters(value):
    """Convert the first number found in ``value`` to a float.

    Args:
        value: Text that may contain a number, possibly surrounded by other
            characters.

    Returns:
        The first numeric token as a ``float``, or ``value`` unchanged when it
        contains no digits at all.
    """
    # Imported locally so the function does not rely on a module-level import.
    import re

    # The optional exponent keeps values written in scientific notation
    # (e.g. "1e-05") intact; without it only the mantissa would be parsed
    # and "1e-05" would come back as 1.0.
    match = re.search(r'[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?', value)
    if match:
        return float(match.group(0))
    return value

def summarize_results_to_csv(directory_path=save_dir, output_file=output_csv_file):
    """Collect every ``best_trial*`` file in a directory into one CSV table.

    Each file is read as text: the second line supplies the score (stored in
    the ``BACC`` column) and every line from the fourth onwards is read as a
    ``key: value`` hyperparameter. One row is produced per file, with the
    columns ``Model``, ``Dataset``, ``BACC`` and one column per hyperparameter.

    Args:
        directory_path: Directory scanned for files whose names start with
            ``best_trial``.
        output_file: Path of the CSV file to write.
    """
    data = []

    # Sorted so that the row order of the CSV is reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.startswith("best_trial"):
            continue

        model_name, dataset_name = extract_model_dataset_names(filename)
        with open(os.path.join(directory_path, filename), 'r') as file:
            lines = file.readlines()

        # Second line carries the score; take the first number found on it.
        bacc_value = handle_special_characters(
            re.findall(r'[+-]?\d+(?:\.\d+)?', lines[1])[0])

        params = {}
        for line in lines[3:]:
            # Split on the first colon only, and skip blank or malformed
            # lines instead of failing while unpacking the split result.
            key, separator, value = line.partition(":")
            if not separator:
                continue
            params[key.strip()] = handle_special_characters(value.strip())

        data.append({
            'Model': model_name,
            'Dataset': dataset_name,
            'BACC': bacc_value,
            **params
        })

    # One row per result file; the union of all parameter names forms the columns.
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)

    print("Data table has been saved to:", output_file)

def main():
# Example setup - Replace with actual data and model handling logic
data_pairs = [{'name': 'example_dataset'}] # This should be your actual data
models = ['example_model'] # Replace with your model names
models = [model_name]

for data_pair in data_pairs:
data_name = data_pair['name'] # Get dataset name
for model_name in models:
print(f"Running hyperparameter optimization for model '{model_name}' with data pair '{data_name}'")

# Assuming run_optuna is your function to run the optimization
study = run_optuna(data_pair, model_name) # Replace with actual function call
save_best_trial(study, model_name, data_name)

# Clear CUDA cache after each run to avoid memory issues
for model in models:
print(f"Running hyperparameter optimization for model '{model}' with data pair '{data_pair['name']}'")
run_optuna(data_pair, model)
torch.cuda.empty_cache()

# After running all optimizations, summarize the results into a CSV
summarize_results_to_csv()

if __name__ == "__main__":
main()
typer.run(run_hyperopt)


35 changes: 27 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ println "Replicates: ${params.replicates}"
println "Metrics: ${params.metrics}"
println "Eval-q: ${params.eval_threshold}"
println "Verbose interval: ${params.verbose_interval}"
println "Data set: ${params.dataSet}"

println ""

Expand Down Expand Up @@ -111,25 +112,43 @@ process CollectStats {
}

process HyperparameterOptimization {


tag "${dataSet}-${model}"

publishDir "${resultsDir}/hyperparameters/${dataSet}", pattern: "best_trial_${model}_${dataSet}.txt", mode: 'copy'

input:
path config_file
path data_files

tuple path(geneFile),
path(networkFile),
val(model),
val(dataSet)

output:
path "optuna_${params.dataSet}.json" into optuna_results
path "best_trial_${model}_${dataSet}.txt", emit: best_trial_output

script:
"""
python3 hyperopt_all_2.py --config ${config_file} --data ${data_files} --output optuna_${params.dataSet}.json
hyperopt.py ${geneFile} ${networkFile}\
${model} \
${dataSet} > best_trial_${model}_${dataSet}.txt
"""
}

workflow hyperoptWorkflow {
HyperparameterOptimization(config_file: params.config, data_files: params.data)
workflow hyperopt {
geneChan = channel.fromPath(params.geneFile)
networkChan = channel.fromPath(params.networkFile)
modelChan = channel.from(params.models)
dataSetChan = channel.value(params.dataSet)
hparamChan = geneChan.combine(networkChan).combine(modelChan).combine(dataSetChan)

hparams = HyperparameterOptimization(
hparamChan
)
hparams.view()
}



workflow {
// building channels for experiments
geneChan = channel.fromPath(params.geneFile)
Expand Down

0 comments on commit 65b0515

Please sign in to comment.