Skip to content

Commit

Permalink
Use the target column name provided by the user
Browse files: browse the repository at this point in the history
  • Loading branch information
teobucci committed Mar 26, 2024
1 parent 76aaa81 commit 4f03dc8
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 21 deletions.
20 changes: 17 additions & 3 deletions hawk/analysis/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def run_tefs_analysis(
results.append(
run_simulation_tefs(
datasets=self.datasets,
target_column_name=self.target_column_name,
config=config,
)
)
Expand Down Expand Up @@ -219,8 +220,21 @@ def run(self):
tefs_results = self.run_tefs_analysis()
pcmci_results = self.run_pcmci_analysis()

self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci(pcmci_results, self.datasets, self.workdir)
self.plot_tefs, self.details_tefs = run_postprocessing_tefs(tefs_results, self.datasets, self.workdir)
self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci(
results_pcmci=pcmci_results,
target_column_name=self.target_column_name,
datasets=self.datasets,
destination_path=self.workdir,
)
self.plot_tefs, self.details_tefs = run_postprocessing_tefs(
results_tefs=tefs_results,
target_column_name=self.target_column_name,
datasets=self.datasets,
destination_path=self.workdir,
)
self.plot_tefs_wrapper, self.details_tefs_wrapper = run_postprocessing_tefs_wrapper(
tefs_results, self.datasets, self.workdir
results_tefs=tefs_results,
target_column_name=self.target_column_name,
datasets=self.datasets,
destination_path=self.workdir,
)
34 changes: 18 additions & 16 deletions hawk/analysis/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def plot_feature_presence_and_r2(df_presence, scores_values, scores_labels):

def run_postprocessing_pcmci(
results_pcmci,
target_column_name,
datasets,
destination_path,
):
Expand Down Expand Up @@ -170,7 +171,7 @@ def run_postprocessing_pcmci(
score_r2 = (
regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target",
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -182,7 +183,7 @@ def run_postprocessing_pcmci(
score_r2_lag = (
regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target",
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -191,10 +192,10 @@ def run_postprocessing_pcmci(
)

inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features}
inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1))
inputs_names_lags[target_column_name] = list(range(1, simulation["params"]["lag"] + 1))
score_r2_lag_ar = regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target",
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -219,8 +220,8 @@ def run_postprocessing_pcmci(
save_to_pkl_file(target_file_results_details, results_table_pcmci)

# Feature presences heatmap
if "target" in all_basin_variables:
all_basin_variables.remove("target")
if target_column_name in all_basin_variables:
all_basin_variables.remove(target_column_name)
all_basin_variables = sorted(list(all_basin_variables))
df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_pcmci)))
scores = []
Expand Down Expand Up @@ -261,6 +262,7 @@ def run_postprocessing_pcmci(

def run_postprocessing_tefs(
results_tefs,
target_column_name,
datasets,
destination_path,
):
Expand Down Expand Up @@ -292,7 +294,7 @@ def run_postprocessing_tefs(
score_r2 = (
regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target",
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -304,7 +306,7 @@ def run_postprocessing_tefs(
score_r2_lag = (
regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target",
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -313,10 +315,10 @@ def run_postprocessing_tefs(
)

inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}
inputs_names_lags["target"] = lagtarget
inputs_names_lags[target_column_name] = lagtarget
score_r2_lag_ar = regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name="target", # TODO change to use the target column name given by the user
target_name=target_column_name, # TODO change to use the target column name given by the user
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand All @@ -341,8 +343,8 @@ def run_postprocessing_tefs(
save_to_pkl_file(target_file_results_details, results_table_te)

# Feature presences heatmap
if "target" in all_basin_variables:
all_basin_variables.remove("target")
if target_column_name in all_basin_variables:
all_basin_variables.remove(target_column_name)
all_basin_variables = sorted(list(all_basin_variables))
df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_tefs)))
scores = []
Expand Down Expand Up @@ -383,6 +385,7 @@ def run_postprocessing_tefs(

def run_postprocessing_tefs_wrapper(
results_tefs,
target_column_name,
datasets,
destination_path,
):
Expand All @@ -397,8 +400,7 @@ def run_postprocessing_tefs_wrapper(
dataset_name = simulation["dataset_name"]
dataframe = datasets[dataset_name]

target_columns = ["target"]
features_columns = dataframe["full"].drop(columns=target_columns).columns
features_columns = dataframe["full"].drop(columns=[target_column_name]).columns

# --------------------- Select features using threshold (conservative) ---------------------
# selected_features_names_with_threshold = simulation["results"].select_features(simulation["params"]["threshold"]) # noqa
Expand All @@ -418,13 +420,13 @@ def run_postprocessing_tefs_wrapper(
lagtarget = simulation["params"]["lagtarget"]

inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}
inputs_names_lags["target"] = lagtarget
inputs_names_lags[target_column_name] = lagtarget

# --- Compute the train_test version ---
test_r2_train_test.append(
regression_analysis(
inputs_names_lags=inputs_names_lags,
target_name=target_columns[0],
target_name=target_column_name,
df_train=dataframe["train"],
df_test=dataframe["test"],
)
Expand Down
5 changes: 3 additions & 2 deletions hawk/analysis/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def run_simulation_pcmci(
def run_simulation_tefs(
datasets,
config,
target_column_name,
n_jobs=1,
):
params = config["params"]
Expand All @@ -79,8 +80,8 @@ def run_simulation_tefs(
# param_str = param_str.replace(" ", "")
# config_id = f"dataset{dataset_name}_{param_str}"

features = dataframe["full"].drop(columns=["target"])
target = dataframe["full"]["target"]
features = dataframe["full"].drop(columns=[target_column_name])
target = dataframe["full"][target_column_name]
var_names = list(features.columns)

# run the feature selection algorithm
Expand Down

0 comments on commit 4f03dc8

Please sign in to comment.