@@ -10,7 +10,7 @@
 import logging
 import sys
 
-id = ""
+
 logging.basicConfig(level=logging.DEBUG,
                     format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                     datefmt='%m-%d %H:%M',
@@ -19,12 +19,15 @@
 # n_workers, n_ps, n_intra, n_iter, n_partition, optimizer, batch_size, learning_rate
 Optimizer = ["SGD", "Adadelta", "Adagrad", "Ftrl", "Adam", "Momentum", "RMSProp"]
 
-def wait_finish():
+def wait_finish(model, id):
     start_time = time.time()
-    dir = os.path.join("temp0")
-    while os.path.exists("/root/code/disML_Framwork/bin/temp0"):
+    if model != "CNN":
+        path = "/root/code/disML_Framwork/bin/temp0"
+    else:
+        path = "/root/code/disCNN_cifar/bin/temp0"
+    while os.path.exists(path):
         logging.info("The job %s is not finish" % id)
-        time.sleep(10)
+        time.sleep(3)
         if (time.time() - start_time) > 18000:
             os.system("./bin/kill_cluster_pid.sh 36 72 22222")
 
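Net effect of the hunk above, as a standalone sketch: wait_finish previously polled only the hard-coded disML_Framwork marker path, so CNN jobs (launched from disCNN_cifar) were never tracked. It now picks the marker path by model, polls every 3 s instead of 10 s, and still force-kills the cluster after 18000 s. The sketch is assembled from the +/- lines, not taken verbatim from the repository; job_id stands in for the original's id, which shadows a Python builtin, and the marker directory appears to be created by the launch scripts while a job runs.

import logging
import os
import time

def wait_finish(model, job_id):
    start_time = time.time()
    # Each framework's launcher keeps a temp0 marker directory alive while
    # a job is running; its disappearance signals completion.
    if model != "CNN":
        path = "/root/code/disML_Framwork/bin/temp0"
    else:
        path = "/root/code/disCNN_cifar/bin/temp0"
    while os.path.exists(path):
        logging.info("The job %s is not finished yet", job_id)
        time.sleep(3)
        # Safety valve: after 5 hours (18000 s) the repository's kill script
        # tears down the whole cluster.
        if time.time() - start_time > 18000:
            os.system("./bin/kill_cluster_pid.sh 36 72 22222")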
@@ -65,7 +68,7 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size,
                                                                              batch_size,
                                                                              n_partition)
     else:
-        cmd = "~/code/disCNN_cifar/bin/ps.sh %d %s %f 22222 %s %d %d %d " % (n_workers,
+        cmd = "~/code/disCNN_cifar/bin/ps.sh %d %d %s %f 22222 %s %d %d %d " % (n_workers,
                                                                              n_ps,
                                                                              optimizer,
                                                                              learning_rate,
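This one-placeholder hunk fixes a format-string arity bug: the old template had seven conversions for eight arguments, so %f lined up with the optimizer string and interpolation raised a TypeError before any command was launched. A hypothetical four-argument reproduction:

# Hypothetical reproduction of the arity bug fixed above.
"%d %s %f" % (4, 32, "SGD", 0.001)     # TypeError: %f receives the string "SGD"
"%d %d %s %f" % (4, 32, "SGD", 0.001)  # OK: '4 32 SGD 0.001000'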
@@ -74,31 +77,33 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size,
                                                                              batch_size,
                                                                              n_partition)
     logging.info("run command: %s" % cmd)
-    id = model + "_" + str(n_workers) + "_" + str(n_intra) + "_" + optimizer + "_" + str(learning_rate) + "_" + str(batch_size) + "_" + str(n_partition)
     #p = pexpect.spawn(cmd)
-    # os.system(cmd+" >> running_log.out")
-    return id
+    os.system(cmd + " >> running_log.out")
+
 
 def run(n_samples, model):
     for i in range(0, n_samples):
         n_workers = np.random.randint(1, 35)
         n_ps = 36 - n_workers
         n_intra = np.random.randint(1, 15)
-        n_partition = int(np.random.randint(0.1, 2) * n_ps)
+        n_partition = int(np.random.randint(1, 50) * n_ps / 10)
+        if n_partition == 0:
+            n_partition = 1
         optimizer = Optimizer[np.random.randint(0, 6)]
         if model != "CNN":
             batch_size = np.random.randint(10, 50) * 100
         else:
             batch_size = np.random.randint(1, 10) * 100
-        learning_rate = np.random.randint(1, 1000) / 10000.0
+        learning_rate = np.random.randint(1, 10) / 100000.0
         threads = []
+        id = model + "_" + str(n_workers) + "_" + str(n_intra) + "_" + optimizer + "_" + str(learning_rate) + "_" + str(batch_size) + "_" + str(n_partition)
         t1 = threading.Thread(target=execute, args=(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate))
         threads.append(t1)
         #job_id = execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate)
         t1.setDaemon(True)
         t1.start()
         time.sleep(10)
-        wait_finish()
+        wait_finish(model, id)
 def main():
     run(int(sys.argv[1]), sys.argv[2])
 if __name__ == "__main__":
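Three sampling details change in the last hunk. The job id is now built in run and handed to wait_finish instead of being returned from execute, which never ran the command before. The learning-rate range narrows from [0.0001, 0.0999] to [0.00001, 0.00009]. And the n_partition draw is fixed: legacy np.random.randint truncates float bounds, so the old randint(0.1, 2) behaved like randint(0, 2) and produced zero partitions about half the time. An illustrative sketch of that pitfall (the n_ps value is an example, not repository code):

import numpy as np

n_ps = 36  # illustrative; the script derives n_ps = 36 - n_workers

# Float bounds are truncated, so this is effectively randint(0, 2):
# n_partition came out as 0 or n_ps, nothing in between.
old_n_partition = int(np.random.randint(0.1, 2) * n_ps)

# New sampling: a multiplier drawn from [1, 50), scaled by n_ps / 10,
# floored at 1 so small n_ps values cannot yield zero partitions.
new_n_partition = int(np.random.randint(1, 50) * n_ps / 10)
if new_n_partition == 0:
    new_n_partition = 1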