Commit ccb6042

Newest .ipynb files for:
- running tabular Q-learning (notebooks/deception_evaluation_cur.ipynb)
- visualizing results of DQN experiments (cyberbattle/agents/baseline/notebook_tabularq.ipynb)
1 parent 07b3732 commit ccb6042

14 files changed: +17818 −1656097 lines changed

cyberbattle/agents/baseline/agent_dql.py (+8 −3)
@@ -507,12 +507,17 @@ def stateaction_as_string(self, action_metadata) -> str:
     def eval(self):
         self.policy_net.eval()
         self.target_net.eval()
-        # self.train_while_exploit = False  # TODO check if that is correct
+        self.prev_train_while_exploit = False
+        if self.train_while_exploit:
+            self.prev_train_while_exploit = True
+            self.train_while_exploit = False
 
     def train(self):
         self.policy_net.train()
         self.target_net.train()
-        # self.train_while_exploit = True  # TODO check if that is correct
+        if hasattr(self, 'prev_train_while_exploit'):
+            self.train_while_exploit = self.prev_train_while_exploit
+            delattr(self, 'prev_train_while_exploit')
 
     def save(self, filename: str, optimizer_save=False) -> None:
         logger.info("Saving policy_net, target_net " + optimizer_save * "and optimizer " + "parameters")
@@ -554,7 +559,7 @@ def load_best(self, logdir_training: str, evaluation_ckpt=True, optimizer_load=T
 
         self.load(os.path.join(logdir_training, filename_best_ckpt), optimizer_load=optimizer_load)
         logger.info(f"Load best model from {'evaluation'*evaluation_ckpt + 'training'*(not evaluation_ckpt)} " +
-                    " from file {os.path.join(logdir_training, filename_best_ckpt)}")
+                    f" from file {os.path.join(logdir_training, filename_best_ckpt)}")
 
     def loss_as_string(self) -> str:
         return str(getattr(self, 'loss').item()) if hasattr(self, 'loss') else ''
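
The eval()/train() change above amounts to stashing and restoring the train_while_exploit flag instead of the earlier commented-out TODO. A minimal standalone sketch of that pattern, assuming a toy Policy class (illustrative only, not the actual DQL agent):

# Minimal sketch of the stash-and-restore pattern used by eval()/train() above.
# Policy is an illustrative stand-in, not the real DQL agent class.
class Policy:
    def __init__(self, train_while_exploit: bool):
        self.train_while_exploit = train_while_exploit

    def eval(self):
        # remember whether exploit steps were allowed to keep training, then disable it
        self.prev_train_while_exploit = False
        if self.train_while_exploit:
            self.prev_train_while_exploit = True
            self.train_while_exploit = False

    def train(self):
        # restore the flag exactly once, and only if eval() stashed it
        if hasattr(self, 'prev_train_while_exploit'):
            self.train_while_exploit = self.prev_train_while_exploit
            delattr(self, 'prev_train_while_exploit')


policy = Policy(train_while_exploit=True)
policy.eval()
assert policy.train_while_exploit is False
policy.train()
assert policy.train_while_exploit is True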

cyberbattle/agents/baseline/learner.py (+39 −17)
@@ -5,6 +5,7 @@
 import math
 import sys
 import os
+import re
 
 from .plotting import PlotTraining, plot_averaged_cummulative_rewards
 from .agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation
@@ -49,6 +50,18 @@ def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[str, Optional
     def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, info, action_metadata) -> None:
         raise NotImplementedError
 
+    def train(self) -> None:
+        return
+
+    def eval(self) -> None:
+        return
+
+    def save(self, filename) -> None:
+        return
+
+    def load_best(self, filename) -> None:
+        return
+
     def parameters_as_string(self) -> str:
         return ''
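
The new no-op train()/eval()/save()/load_best() defaults let the shared training and evaluation loops call these hooks on any learner, not only the DQL agent. A self-contained sketch of the idea, using illustrative stand-ins rather than the real Learner hierarchy:

# Illustrative stand-ins only; the real base class is Learner in this module.
class BaseLearner:
    def train(self) -> None:
        return

    def eval(self) -> None:
        return

    def save(self, filename) -> None:
        return

    def load_best(self, filename) -> None:
        return


class RandomLearner(BaseLearner):
    pass  # inherits the no-op hooks, so checkpoint calls are harmless


class DeepQLearner(BaseLearner):
    def eval(self) -> None:
        print("switching policy/target networks to eval mode")


for learner in (RandomLearner(), DeepQLearner()):
    learner.eval()                  # safe for both learners
    learner.save("checkpoint.tar")  # no-op for the random baseline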

@@ -75,7 +88,8 @@ def explore(self, wrapped_env: AgentWrapper) -> Tuple[str, cyberbattle_env.Actio
         return "explore", gym_action, None
 
     def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]:
-        raise NotImplementedError
+        gym_action = wrapped_env.env.sample_valid_action()
+        return "explore", gym_action, None
 
     def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, info, action_metadata):
         return None
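
With this change, exploit() on the random baseline falls back to sampling a valid action instead of raising. A rough sketch of why that fallback is needed, assuming the usual epsilon-greedy dispatch (function and argument names below are illustrative, not the exact loop in this file):

import random

# Once epsilon decays below 1.0, a loop like this starts calling exploit()
# on every learner, including the random baseline, so it must return an action.
def epsilon_greedy_step(learner, wrapped_env, observation, epsilon):
    if random.random() < epsilon:
        return learner.explore(wrapped_env)
    return learner.exploit(wrapped_env, observation)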
@@ -119,7 +133,7 @@ def write_to_summary(writer, all_rewards, epsilon, loss_string, observation, ite
     # TODO: make higher verbosity level
     # writer.add_histogram(writer_tag + "/rewards", all_rewards, steps_done)
     writer.add_scalar(writer_tag + "/epsilon", epsilon, steps_done) if is_training else ''
-    writer.add_scalar("loss", float(loss_string.split("=")[-1]), steps_done) if is_training and loss_string else ''
+    writer.add_scalar("loss", float(re.sub(r'[^a-zA-Z0-9]', '', loss_string.split("=")[-1])), steps_done) if is_training and loss_string else ''
 
     n_positive_actions = np.sum(np.array(all_rewards) > 0)
     writer.add_scalar(writer_tag + "/n_positive_actions", n_positive_actions, steps_done)
@@ -203,9 +217,7 @@ def evaluate_model(
     plot_title = f"{title} (epochs={eval_episode_count}, ϵ={initial_epsilon}" + learner.parameters_as_string()
 
     render_file_index = 1
-    train_while_exploit_before = learner.train_while_exploit
     learner.eval()
-    learner.train_while_exploit = False
 
     if configuration.log_results:
         detection_points_results = {}
@@ -334,8 +346,9 @@ def evaluate_model(
                 learner.save(save_model_filename.replace('.tar', f'_eval_steps{training_steps_done + steps_done}.tar'))
                 learner.save(save_model_filename.replace('.tar', '_eval_best.tar'))
 
-        write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_eval_running_mean,
-                         training_steps_done + steps_done, writer_tag="evaluation")
+        if configuration.log_results:
+            write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_eval_running_mean,
+                             training_steps_done + steps_done, writer_tag="evaluation")
         length = episode_ended_at if episode_ended_at else iteration_count
         learner.end_of_episode(i_episode=i_episode, t=length)
         # if render:
@@ -345,7 +358,6 @@ def evaluate_model(
     logger.info("evaluation ended\n") if configuration.log_results else None
 
     learner.train()
-    learner.train_while_exploit = train_while_exploit_before
 
     return TrainedLearner(
         all_episodes_rewards=all_episodes_rewards,
@@ -438,9 +450,9 @@ def epsilon_greedy_search(
                 f"Learning with: episode_count={episode_count},"
                 f"iteration_count={iteration_count},"
                 f"ϵ={epsilon},"
-                f'ϵ_min={epsilon_minimum}, '
-                + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '')
-                + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
+                f'ϵ_min={epsilon_minimum}, ' +
+                (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') +
+                (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
                 f"{learner.parameters_as_string()}")
 
     initial_epsilon = epsilon
@@ -460,11 +472,19 @@ def epsilon_greedy_search(
 
         # print(learner.parameters_as_string().replace("γ", "gamma").replace("replaymemory", "replay_memory_size").replace(" ", "").replace("\n", "").split(","))
 
-        hparams_dict.update({param_val.split("=")[0]: float(param_val.split("=")[1]) if float(param_val.split("=")[1]) != round(float(param_val.split("=")[1])) else int(param_val.split("=")[1])
+        hparams_dict.update({param_val.split("=")[0]: float(param_val.split("=")[1]) if len(param_val.split("=")) > 1 and
+                             float(param_val.split("=")[1]) != round(float(param_val.split("=")[1])) else
+                             (int(param_val.split("=")[1]) if len(param_val.split("=")) > 1 else '')
                              for param_val in learner.parameters_as_string().replace("γ",
                                                                                      "gamma").replace("replaymemory",
                                                                                                       "replay_memory_size").replace("\n", "").replace(" ", "").split(",")})
-        hparam_domain_discrete = {"gamma": [0.015, 0.25, 0.5, 0.8], "train_while_exploit": [0, 1], "reward_clip": [0, 1]}
+        hparam_domain_discrete = {}
+        if 'gamma' in hparams_dict:
+            hparam_domain_discrete["gamma"] = [0.015, 0.25, 0.5, 0.8] if '' != hparams_dict.get('gamma', '') else ['']
+        if 'train_while_exploit' in hparams_dict:
+            hparam_domain_discrete["train_while_exploit"] = [0, 1] if '' != hparams_dict.get('train_while_exploit', '') else ['']
+        if 'reward_clip' in hparams_dict:
+            hparam_domain_discrete["reward_clip"] = [0, 1] if '' != hparams_dict.get('reward_clip', '') else ['']
 
         exp, ssi, sei = hparams(hparams_dict,
                                 metric_dict={"run_mean": -3000,
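
The updated dict comprehension parses learner.parameters_as_string() as comma-separated key=value pairs, keeping whole numbers as int, genuine fractions as float, and falling back to '' when a fragment has no '='. A toy illustration of that conversion, using a made-up parameter string (the real one comes from parameters_as_string()):

# Made-up parameter string; the real one is produced by learner.parameters_as_string().
params = "gamma=0.5, replay_memory_size=1000, train_while_exploit=1"
hparams_toy = {}
for param_val in params.replace("\n", "").replace(" ", "").split(","):
    key, _, value = param_val.partition("=")
    if not value:
        hparams_toy[key] = ''                  # fragment without '='
    elif float(value) != round(float(value)):
        hparams_toy[key] = float(value)        # e.g. gamma=0.5 stays a float
    else:
        hparams_toy[key] = int(float(value))   # e.g. replay_memory_size=1000 becomes an int
# hparams_toy == {'gamma': 0.5, 'replay_memory_size': 1000, 'train_while_exploit': 1}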
@@ -485,10 +505,10 @@ def epsilon_greedy_search(
                                                 ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset()))
     steps_done = 0
     i_episode = 0
-    plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \
-        + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \
-        + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \
-        + learner.parameters_as_string()
+    plot_title = (f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," +
+                  (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') +
+                  (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
+                  learner.parameters_as_string())
     plottraining = PlotTraining(title=plot_title, render_each_episode=render)
 
     render_file_index = 1
@@ -498,6 +518,8 @@ def epsilon_greedy_search(
 
     detection_points_results = {}
 
+    logger.info('episode_counts ' + str(episode_count))
+
     # for i_episode in range(1, episode_count + 1):
     while steps_done <= episode_count * iteration_count:
         i_episode += 1
@@ -674,7 +696,7 @@ def epsilon_greedy_search(
                 learner.save(save_model_filename.replace('.tar', f'_steps{steps_done}.tar'))
                 learner.save(save_model_filename.replace('.tar', '_best.tar'))
 
-        if not only_eval_summary:
+        if configuration.log_results and not only_eval_summary:
            write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_running_mean,
                             steps_done)
 