Skip to content

Commit af0b073

Browse files
AndyShen105 authored and committed
update
1 parent 7567e66 commit af0b073

File tree

3 files changed

+17 -17 lines changed

disML_Framwork.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@
7272

7373
#inout data
7474
if FLAGS.ML_model == "SVM":
75-
trainset_files = ["hdfs://b10g37:8020/user/root/train_data/url_combined"]
75+
trainset_files = ["hdfs://b10g37:8020/user/root/train_data/kddb"]
7676
else:
7777
trainset_files=["hdfs://b10g37:8020/user/root/train_data/kdd12.tr"]
7878
train_filename_queue = tf.train.string_input_producer(trainset_files)
@@ -126,7 +126,9 @@
126126
cost = 1000000.0
127127
step = 0
128128
while not sv.should_stop() and not (cost < targeted_loss) or (step<1000 and (FLAGS.ML_model=="SVM")) :
129+
print ("info: start read data process")
129130
label_one_hot,label,indices,sparse_indices,weight_list,read_count = read_batch(sess, train_data_line, batch_size)
131+
print ("info: start training process")
130132
if FLAGS.ML_model=="LR":
131133
_,cost, step= sess.run([train_op, LR_loss, global_step], feed_dict = { y: label_one_hot,
132134
sp_indices: sparse_indices,
@@ -141,6 +143,7 @@
141143
weights_val: weight_list})
142144

143145
duration = time.time()-batch_time
146+
print ("info: start ")
144147
if (time.time()-check_point_time>600) and is_chief:
145148
print ("do a check_points")
146149
saver.save(sess, save_path="train_logs", global_step=global_step)
@@ -150,7 +153,6 @@
150153
process = open("/root/ex_result/baseline/"+job_id+"_process.csv","a+")
151154
process.write(re+"\r\n")
152155
process.close()
153-
154156
print("Step: %d," % (step+1),
155157
" Loss: %f" % cost,
156158
" Bctch_Time: %fs" % float(duration))

ml_model.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ def SVMModel_with_linear(x_data, y, num_features, variable_partition_num):
3838
with tf.name_scope('parameter'):
3939
weight_svm = tf.get_variable("weight_svm", initializer=tf.constant(0.0, shape=[num_features, 1]),
4040
partitioner=tf.fixed_size_partitioner(variable_partition_num))
41-
b = tf.Variable(tf.constant(0.1, shape=[1]))
41+
b = tf.Variable(tf.constant(0.1, shape=[1, 1]))
4242
y_ = tf.subtract(tf.sparse_tensor_dense_matmul(x_data, weight_svm), b)
4343
alpha = tf.constant([0.001])
4444
with tf.name_scope('loss'):

start_experiment.py

+12-14
Original file line number | Diff line number | Diff line change
@@ -17,24 +17,23 @@
1717
)
1818

1919
# n_workers, n_ps, n_intra, n_iter, n_partition, optimizer, batch_size, learning_rate
20-
Optimizer=["SGD","Adadelta","Adagrad","Ftrl","Adam","Momentum","RMSProp"]
20+
Optimizer=["SGD","Adadelta","Adagrad","Adam","Momentum","RMSProp"]
2121

2222
def wait_finish(model, id):
2323
start_time = time.time()
2424
if model != "CNN":
25-
path = "/root/code/disML_Framwork/bin/temp0"
26-
maxTime = 30000.0
25+
path = "/root/code/disML_Framwork/bin/temp0"
26+
maxTime = 36000.0
2727
else:
28-
path = "/root/code/disCNN_cifar/bin/temp0"
29-
maxTime = 18000.0
28+
path = "/root/code/disCNN_cifar/bin/temp0"
29+
maxTime = 10800.0
3030
while os.path.exists(path):
3131
logging.info("The job %s is not finish" % id)
32-
logging.info("Running time is %f s" % (time.time()-start_time))
32+
logging.info("Running time is %f s" % (time.time()-start_time))
3333
time.sleep(10)
34-
if (time.time()-start_time)>maxTime :
35-
os.system("./bin/kill_cluster_pid.sh 36 72 22222")
36-
break
37-
34+
if (time.time()-start_time)>maxTime :
35+
os.system("./bin/kill_cluster_pid.sh 1 36 22222")
36+
break
3837
logging.info("The job %s is finish !" % id)
3938

4039
def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate):
@@ -53,7 +52,7 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size,
5352
learning_rate
5453
))
5554
if model == "SVM":
56-
cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 3231961 0.07" % (n_workers,
55+
cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 29890095 0.1" % (n_workers,
5756
n_ps,
5857
optimizer,
5958
learning_rate,
@@ -91,13 +90,12 @@ def run(n_samples, model):
9190
n_partition = int(np.random.randint(1, 50)*n_ps/10)
9291
if n_partition == 0:
9392
n_partition=1
94-
optimizer = Optimizer[np.random.randint(0, 6)]
93+
optimizer = Optimizer[np.random.randint(0, 5)]
9594
if model != "CNN":
9695
batch_size = np.random.randint(10, 50)*100
97-
learning_rate = np.random.randint(1, 10)/10000.0
9896
else:
9997
batch_size = np.random.randint(1, 10)*100
100-
learning_rate = np.random.randint(1, 10)/100000.0
98+
learning_rate = np.random.randint(1, 10)/100000.0
10199
threads = []
102100
id = model+"_"+str(n_workers)+"_"+str(n_intra)+"_"+optimizer+"_"+str(learning_rate)+"_"+str(batch_size)+"_"+str(n_partition)
103101
t1 = threading.Thread(target=execute,args=(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate))

0 commit comments

Comments
 (0)