Skip to content

Commit e64f656

Browse files
AndyShen105 authored and committed
update
1 parent ee5a35e commit e64f656

File tree

2 files changed

+18
-13
lines changed

2 files changed

+18
-13
lines changed

bin/ps.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ do
6161
then
6262
sleep 0.5
6363
fi
64-
ssh $ip python /root/code/disML_Framwork/disML_Framwork.py $ps $worker --job_name=worker --task_index=$index --Learning_rate=$4 --ML_model=$6 --optimizer=$3 --n_intra_threads=$7 --Batch_size=$8 --n_partitions=$9 --num_Features=${10} --targeted_loss=${11} >> /root/code/$index".temp"
64+
ssh $ip python /root/code/disML_Framwork/disML_Framwork.py $ps $worker --job_name=worker --task_index=$index --Learning_rate=$4 --ML_model=$6 --optimizer=$3 --n_intra_threads=$7 --Batch_size=$8 --n_partitions=$9 --num_Features=${10} --targeted_loss=0.5 >> /root/code/$index".temp"
6565
echo "worker"$index" complated"
6666
echo "1"> /root/code/disML_Framwork/bin/temp$index
6767
fi

start_experiment.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import logging
1111
import sys
1212

13-
id = ""
13+
1414
logging.basicConfig(level=logging.DEBUG,
1515
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
1616
datefmt='%m-%d %H:%M',
@@ -19,12 +19,15 @@
1919
# n_workers, n_ps, n_intra, n_iter, n_partition, optimizer, batch_size, learning_rate
2020
Optimizer=["SGD","Adadelta","Adagrad","Ftrl","Adam","Momentum","RMSProp"]
2121

22-
def wait_finish():
22+
def wait_finish(model, id):
2323
start_time = time.time()
24-
dir = os.path.join("temp0")
25-
while os.path.exists("/root/code/disML_Framwork/bin/temp0"):
24+
if model != "CNN":
25+
path = "/root/code/disML_Framwork/bin/temp0"
26+
else:
27+
path = "/root/code/disCNN_cifar/bin/temp0"
28+
while os.path.exists(path):
2629
logging.info("The job %s is not finish" % id)
27-
time.sleep(10)
30+
time.sleep(3)
2831
if (time.time()-start_time)>18000:
2932
os.system("./bin/kill_cluster_pid.sh 36 72 22222")
3033

@@ -65,7 +68,7 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size,
6568
batch_size,
6669
n_partition)
6770
else:
68-
cmd = "~/code/disCNN_cifar/bin/ps.sh %d %s %f 22222 %s %d %d %d " % (n_workers,
71+
cmd = "~/code/disCNN_cifar/bin/ps.sh %d %d %s %f 22222 %s %d %d %d " % (n_workers,
6972
n_ps,
7073
optimizer,
7174
learning_rate,
@@ -74,31 +77,33 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size,
7477
batch_size,
7578
n_partition)
7679
logging.info("run command: %s" % cmd)
77-
id = model+"_"+str(n_workers)+"_"+str(n_intra)+"_"+optimizer+"_"+str(learning_rate)+"_"+str(batch_size)+"_"+str(n_partition)
7880
#p = pexpect.spawn(cmd)
79-
#os.system(cmd+" >> running_log.out")
80-
return id
81+
os.system(cmd+" >> running_log.out")
82+
8183

8284
def run(n_samples, model):
8385
for i in range(0, n_samples):
8486
n_workers = np.random.randint(1, 35)
8587
n_ps = 36-n_workers
8688
n_intra = np.random.randint(1, 15)
87-
n_partition = int(np.random.randint(0.1, 2)*n_ps)
89+
n_partition = int(np.random.randint(1, 50)*n_ps/10)
90+
if n_partition == 0:
91+
n_partition=1
8892
optimizer = Optimizer[np.random.randint(0, 6)]
8993
if model != "CNN":
9094
batch_size = np.random.randint(10, 50)*100
9195
else:
9296
batch_size = np.random.randint(1, 10)*100
93-
learning_rate = np.random.randint(1, 1000)/10000.0
97+
learning_rate = np.random.randint(1, 10)/100000.0
9498
threads = []
99+
id = model+"_"+str(n_workers)+"_"+str(n_intra)+"_"+optimizer+"_"+str(learning_rate)+"_"+str(batch_size)+"_"+str(n_partition)
95100
t1 = threading.Thread(target=execute,args=(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate))
96101
threads.append(t1)
97102
#job_id = execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate)
98103
t1.setDaemon(True)
99104
t1.start()
100105
time.sleep(10)
101-
wait_finish()
106+
wait_finish(model, id)
102107
def main():
103108
run(int(sys.argv[1]), sys.argv[2])
104109
if __name__=="__main__":

0 commit comments

Comments (0)