@@ -17,24 +17,23 @@
 )

 # n_workers, n_ps, n_intra, n_iter, n_partition, optimizer, batch_size, learning_rate
-Optimizer = ["SGD", "Adadelta", "Adagrad", "Ftrl", "Adam", "Momentum", "RMSProp"]
+Optimizer = ["SGD", "Adadelta", "Adagrad", "Adam", "Momentum", "RMSProp"]

 def wait_finish(model, id):
     start_time = time.time()
     if model != "CNN":
-        path = "/root/code/disML_Framwork/bin/temp0"
-        maxTime = 30000.0
+        path = "/root/code/disML_Framwork/bin/temp0"
+        maxTime = 36000.0
     else:
-        path = "/root/code/disCNN_cifar/bin/temp0"
-        maxTime = 18000.0
+        path = "/root/code/disCNN_cifar/bin/temp0"
+        maxTime = 10800.0
     while os.path.exists(path):
         logging.info("The job %s is not finish" % id)
-        logging.info("Running time is %f s" % (time.time() - start_time))
+        logging.info("Running time is %f s" % (time.time() - start_time))
         time.sleep(10)
-        if (time.time() - start_time) > maxTime:
-            os.system("./bin/kill_cluster_pid.sh 36 72 22222")
-            break
-
+        if (time.time() - start_time) > maxTime:
+            os.system("./bin/kill_cluster_pid.sh 1 36 22222")
+            break
     logging.info("The job %s is finish!" % id)

 def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate):
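The `wait_finish` hunk above is a poll-with-timeout loop: the job counts as running while a sentinel file exists, and the cluster is torn down once the wall-clock budget (`maxTime`) is exhausted. A minimal self-contained sketch of that pattern; the function name, sentinel path, and kill command here are illustrative placeholders, not this repo's API:

```python
import logging
import os
import subprocess
import time

def wait_for_sentinel(path, max_seconds, kill_argv, poll_interval=10):
    """Block until the sentinel file at `path` disappears.

    Returns True if the job removed the sentinel on its own, or False if
    `max_seconds` elapsed first and `kill_argv` was run to stop the cluster.
    """
    start = time.time()
    while os.path.exists(path):
        elapsed = time.time() - start
        logging.info("job still running after %.0f s", elapsed)
        if elapsed > max_seconds:
            # e.g. kill_argv = ["./bin/kill_cluster_pid.sh", "1", "36", "22222"]
            subprocess.call(kill_argv)
            return False
        time.sleep(poll_interval)
    return True
```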
@@ -53,7 +52,7 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate):
                  learning_rate
                  ))
     if model == "SVM":
-        cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 3231961 0.07 " % (n_workers,
+        cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 29890095 0.1 " % (n_workers,
                                                                            n_ps,
                                                                            optimizer,
                                                                            learning_rate,
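The launch command this hunk edits packs ten-plus positional slots into one format string, where a transposed argument fails silently. Below is a sketch of the same launch as an argv list; it assumes (a guess, since the hunk truncates the argument tuple after `learning_rate`) that the remaining slots are model, n_intra, batch_size, and n_partition, and the two trailing constants are copied verbatim from the patch without their meaning being defined in this hunk:

```python
import subprocess

def launch_svm_job(n_workers, n_ps, optimizer, learning_rate,
                   model, n_intra, batch_size, n_partition):
    # Same slots as "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 29890095 0.1",
    # but each position is spelled out, so a misordered argument is visible.
    argv = ["./bin/ps.sh",
            str(n_workers), str(n_ps), optimizer, str(learning_rate),
            "22222", model, str(n_intra), str(batch_size), str(n_partition),
            "29890095", "0.1"]
    return subprocess.Popen(argv)
```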
@@ -91,13 +90,12 @@ def run(n_samples, model):
91
90
n_partition = int (np .random .randint (1 , 50 )* n_ps / 10 )
92
91
if n_partition == 0 :
93
92
n_partition = 1
94
- optimizer = Optimizer [np .random .randint (0 , 6 )]
93
+ optimizer = Optimizer [np .random .randint (0 , 5 )]
95
94
if model != "CNN" :
96
95
batch_size = np .random .randint (10 , 50 )* 100
97
- learning_rate = np .random .randint (1 , 10 )/ 10000.0
98
96
else :
99
97
batch_size = np .random .randint (1 , 10 )* 100
100
- learning_rate = np .random .randint (1 , 10 )/ 100000.0
98
+ learning_rate = np .random .randint (1 , 10 )/ 100000.0
101
99
threads = []
102
100
id = model + "_" + str (n_workers )+ "_" + str (n_intra )+ "_" + optimizer + "_" + str (learning_rate )+ "_" + str (batch_size )+ "_" + str (n_partition )
103
101
t1 = threading .Thread (target = execute ,args = (model , n_workers , n_ps , n_intra , n_partition , optimizer , batch_size , learning_rate ))
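One detail worth flagging in this last hunk: `np.random.randint(low, high)` excludes `high`, so the updated `Optimizer[np.random.randint(0, 5)]` can never pick index 5 ("RMSProp") of the six-element list, just as the old `randint(0, 6)` never reached "RMSProp" at index 6 of the seven-element list. If uniform sampling over every optimizer is the intent, a sketch:

```python
import numpy as np

Optimizer = ["SGD", "Adadelta", "Adagrad", "Adam", "Momentum", "RMSProp"]

# randint's upper bound is exclusive, so size the range from the list itself;
# np.random.choice is an equivalent one-liner.
optimizer = Optimizer[np.random.randint(0, len(Optimizer))]
optimizer = np.random.choice(Optimizer)
```

Separately, the hunk removes the non-CNN `learning_rate` assignment without a replacement; unless `learning_rate` is initialized earlier in `run` (not shown in this hunk), building the `id` string would raise a NameError for non-CNN models.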
0 commit comments