import math
import sys
import os
+import re

from .plotting import PlotTraining, plot_averaged_cummulative_rewards
from .agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation
@@ -49,6 +50,18 @@ def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[str, Optional
    def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, info, action_metadata) -> None:
        raise NotImplementedError

+    def train(self) -> None:
+        return
+
+    def eval(self) -> None:
+        return
+
+    def save(self, filename) -> None:
+        return
+
+    def load_best(self, filename) -> None:
+        return
+
    def parameters_as_string(self) -> str:
        return ''
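A minimal sketch of why the base class gains no-op train/eval/save/load_best hooks, assuming a hypothetical run_eval driver: generic code such as evaluate_model can then call these hooks on any learner, model-backed or not, without guarding each call.

def run_eval(learner, save_path):
    learner.eval()           # model-backed learners switch to inference mode; the default is a no-op
    # ... roll out evaluation episodes here ...
    learner.save(save_path)  # only learners that override save() persist anything
    learner.train()          # restore training mode afterwards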
@@ -75,7 +88,8 @@ def explore(self, wrapped_env: AgentWrapper) -> Tuple[str, cyberbattle_env.Actio
        return "explore", gym_action, None

    def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]:
-        raise NotImplementedError
+        gym_action = wrapped_env.env.sample_valid_action()
+        return "explore", gym_action, None

    def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, info, action_metadata):
        return None
@@ -119,7 +133,7 @@ def write_to_summary(writer, all_rewards, epsilon, loss_string, observation, ite
    # TODO: make higher verbosity level
    # writer.add_histogram(writer_tag + "/rewards", all_rewards, steps_done)
    writer.add_scalar(writer_tag + "/epsilon", epsilon, steps_done) if is_training else ''
-    writer.add_scalar("loss", float(loss_string.split("=")[-1]), steps_done) if is_training and loss_string else ''
+    writer.add_scalar("loss", float(re.sub(r'[^0-9.eE+-]', '', loss_string.split("=")[-1])), steps_done) if is_training and loss_string else ''

    n_positive_actions = np.sum(np.array(all_rewards) > 0)
    writer.add_scalar(writer_tag + "/n_positive_actions", n_positive_actions, steps_done)
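A minimal sketch of the loss-string sanitisation used above, assuming loss_string takes a form like "loss=0.0421" (possibly with stray trailing characters); parse_loss is an illustrative helper, not part of the module.

import re

def parse_loss(loss_string: str) -> float:
    # Take the value after the last '=' and keep only characters that can appear in a float literal.
    raw = loss_string.split("=")[-1]
    cleaned = re.sub(r"[^0-9.eE+-]", "", raw)
    return float(cleaned)

assert parse_loss("loss=0.0421") == 0.0421
assert parse_loss("loss = 3.5e-2]") == 0.035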
@@ -203,9 +217,7 @@ def evaluate_model(
    plot_title = f"{title} (epochs={eval_episode_count}, ϵ={initial_epsilon}" + learner.parameters_as_string()

    render_file_index = 1
-    train_while_exploit_before = learner.train_while_exploit
    learner.eval()
-    learner.train_while_exploit = False

    if configuration.log_results:
        detection_points_results = {}
@@ -334,8 +346,9 @@ def evaluate_model(
                learner.save(save_model_filename.replace('.tar', f'_eval_steps{training_steps_done + steps_done}.tar'))
                learner.save(save_model_filename.replace('.tar', '_eval_best.tar'))

-        write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_eval_running_mean,
-                         training_steps_done + steps_done, writer_tag="evaluation")
+        if configuration.log_results:
+            write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_eval_running_mean,
+                             training_steps_done + steps_done, writer_tag="evaluation")
        length = episode_ended_at if episode_ended_at else iteration_count
        learner.end_of_episode(i_episode=i_episode, t=length)
        # if render:
@@ -345,7 +358,6 @@ def evaluate_model(
    logger.info("evaluation ended\n") if configuration.log_results else None

    learner.train()
-    learner.train_while_exploit = train_while_exploit_before

    return TrainedLearner(
        all_episodes_rewards=all_episodes_rewards,
@@ -438,9 +450,9 @@ def epsilon_greedy_search(
                f"Learning with: episode_count={episode_count},"
                f"iteration_count={iteration_count},"
                f"ϵ={epsilon},"
-                f'ϵ_min={epsilon_minimum}, '
-                + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '')
-                + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
+                f'ϵ_min={epsilon_minimum}, ' +
+                (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') +
+                (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
                f"{learner.parameters_as_string()}")

    initial_epsilon = epsilon
@@ -460,11 +472,19 @@ def epsilon_greedy_search(
        # print(learner.parameters_as_string().replace("γ", "gamma").replace("replaymemory", "replay_memory_size").replace(" ", "").replace("\n", "").split(","))

-        hparams_dict.update({param_val.split("=")[0]: float(param_val.split("=")[1]) if float(param_val.split("=")[1]) != round(float(param_val.split("=")[1])) else int(param_val.split("=")[1])
+        hparams_dict.update({param_val.split("=")[0]: float(param_val.split("=")[1]) if len(param_val.split("=")) > 1 and
+                             float(param_val.split("=")[1]) != round(float(param_val.split("=")[1])) else
+                             (int(param_val.split("=")[1]) if len(param_val.split("=")) > 1 else '')
                             for param_val in learner.parameters_as_string().replace("γ",
                                                                                     "gamma").replace("replaymemory",
                                                                                                      "replay_memory_size").replace("\n", "").replace(" ", "").split(",")})
-        hparam_domain_discrete = {"gamma": [0.015, 0.25, 0.5, 0.8], "train_while_exploit": [0, 1], "reward_clip": [0, 1]}
+        hparam_domain_discrete = {}
+        if 'gamma' in hparams_dict:
+            hparam_domain_discrete["gamma"] = [0.015, 0.25, 0.5, 0.8] if '' != hparams_dict.get('gamma', '') else ['']
+        if 'train_while_exploit' in hparams_dict:
+            hparam_domain_discrete["train_while_exploit"] = [0, 1] if '' != hparams_dict.get('train_while_exploit', '') else ['']
+        if 'reward_clip' in hparams_dict:
+            hparam_domain_discrete["reward_clip"] = [0, 1] if '' != hparams_dict.get('reward_clip', '') else ['']

        exp, ssi, sei = hparams(hparams_dict,
                                metric_dict={"run_mean": -3000,
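A minimal sketch of how the comprehension above turns learner.parameters_as_string() into a TensorBoard hparams dict, assuming the string looks like "γ=0.5, replaymemory=1000"; parse_hparams is an illustrative helper, not part of the module.

def parse_hparams(params: str) -> dict:
    hparams_dict = {}
    normalized = (params.replace("γ", "gamma")
                        .replace("replaymemory", "replay_memory_size")
                        .replace("\n", "")
                        .replace(" ", ""))
    for pair in normalized.split(","):
        key, _, value = pair.partition("=")
        if not value:
            hparams_dict[key] = ''   # tolerate empty or malformed entries
            continue
        number = float(value)
        # keep fractional values as float, whole values as int
        hparams_dict[key] = number if number != round(number) else int(number)
    return hparams_dict

# parse_hparams("γ=0.5, replaymemory=1000") -> {"gamma": 0.5, "replay_memory_size": 1000}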
@@ -485,10 +505,10 @@ def epsilon_greedy_search(
        ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset()))
    steps_done = 0
    i_episode = 0
-    plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \
-        + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \
-        + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \
-        + learner.parameters_as_string()
+    plot_title = (f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," +
+                  (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') +
+                  (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') +
+                  learner.parameters_as_string())
    plottraining = PlotTraining(title=plot_title, render_each_episode=render)

    render_file_index = 1
@@ -498,6 +518,8 @@ def epsilon_greedy_search(
        detection_points_results = {}

+    logger.info('episode_counts ' + str(episode_count))
+
    # for i_episode in range(1, episode_count + 1):
    while steps_done <= episode_count * iteration_count:
        i_episode += 1
@@ -674,7 +696,7 @@ def epsilon_greedy_search(
            learner.save(save_model_filename.replace('.tar', f'_steps{steps_done}.tar'))
            learner.save(save_model_filename.replace('.tar', '_best.tar'))

-        if not only_eval_summary:
+        if configuration.log_results and not only_eval_summary:
            write_to_summary(writer, np.array(all_rewards), epsilon, loss_string, observation, iteration_count, best_running_mean,
                             steps_done)