@@ -17,24 +17,23 @@
 )

 # n_workers, n_ps, n_intra, n_iter, n_partition, optimizer, batch_size, learning_rate
-Optimizer = ["SGD", "Adadelta", "Adagrad", "Ftrl", "Adam", "Momentum", "RMSProp"]
+Optimizer = ["SGD", "Adadelta", "Adagrad", "Adam", "Momentum", "RMSProp"]

 def wait_finish(model, id):
     start_time = time.time()
     if model != "CNN":
-        path = "/root/code/disML_Framwork/bin/temp0"
-        maxTime = 30000.0
+        path = "/root/code/disML_Framwork/bin/temp0"
+        maxTime = 36000.0
     else:
-        path = "/root/code/disCNN_cifar/bin/temp0"
-        maxTime = 18000.0
+        path = "/root/code/disCNN_cifar/bin/temp0"
+        maxTime = 10800.0
     while os.path.exists(path):
         logging.info("The job %s is not finish" % id)
-        logging.info("Running time is %f s" % (time.time() - start_time))
+        logging.info("Running time is %f s" % (time.time() - start_time))
         time.sleep(10)
-        if (time.time() - start_time) > maxTime:
-            os.system("./bin/kill_cluster_pid.sh 36 72 22222")
-            break
-
+        if (time.time() - start_time) > maxTime:
+            os.system("./bin/kill_cluster_pid.sh 1 36 22222")
+            break
     logging.info("The job %s is finish!" % id)

 def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate):
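The `wait_finish` hunk above is a poll-with-timeout loop: the job counts as running while a sentinel file exists, and the cluster is torn down once the wall-clock budget (`maxTime`) is exhausted. A minimal self-contained sketch of that pattern; the function name, sentinel path, and kill command here are illustrative placeholders, not this repo's API:

```python
import logging
import os
import subprocess
import time

def wait_for_sentinel(path, max_seconds, kill_argv, poll_interval=10):
    """Block until the sentinel file at `path` disappears.

    Returns True if the job removed the sentinel on its own, or False if
    `max_seconds` elapsed first and `kill_argv` was run to stop the cluster.
    """
    start = time.time()
    while os.path.exists(path):
        elapsed = time.time() - start
        logging.info("job still running after %.0f s", elapsed)
        if elapsed > max_seconds:
            # e.g. kill_argv = ["./bin/kill_cluster_pid.sh", "1", "36", "22222"]
            subprocess.call(kill_argv)
            return False
        time.sleep(poll_interval)
    return True
```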
@@ -53,7 +52,7 @@ def execute(model, n_workers, n_ps, n_intra, n_partition, optimizer, batch_size, learning_rate):
                  learning_rate
                  ))
     if model == "SVM":
-        cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 3231961 0.07 " % (n_workers,
+        cmd = "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 29890095 0.1 " % (n_workers,
                                                                            n_ps,
                                                                            optimizer,
                                                                            learning_rate,
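The launch command this hunk edits packs ten-plus positional slots into one format string, where a transposed argument fails silently. Below is a sketch of the same launch as an argv list; it assumes (a guess, since the hunk truncates the argument tuple after `learning_rate`) that the remaining slots are model, n_intra, batch_size, and n_partition, and the two trailing constants are copied verbatim from the patch without their meaning being defined in this hunk:

```python
import subprocess

def launch_svm_job(n_workers, n_ps, optimizer, learning_rate,
                   model, n_intra, batch_size, n_partition):
    # Same slots as "./bin/ps.sh %d %d %s %f 22222 %s %d %d %d 29890095 0.1",
    # but each position is spelled out, so a misordered argument is visible.
    argv = ["./bin/ps.sh",
            str(n_workers), str(n_ps), optimizer, str(learning_rate),
            "22222", model, str(n_intra), str(batch_size), str(n_partition),
            "29890095", "0.1"]
    return subprocess.Popen(argv)
```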
@@ -91,13 +90,12 @@ def run(n_samples, model):
91
90
n_partition = int (np .random .randint (1 , 50 )* n_ps / 10 )
92
91
if n_partition == 0 :
93
92
n_partition = 1
94
- optimizer = Optimizer [np .random .randint (0 , 6 )]
93
+ optimizer = Optimizer [np .random .randint (0 , 5 )]
95
94
if model != "CNN" :
96
95
batch_size = np .random .randint (10 , 50 )* 100
97
- learning_rate = np .random .randint (1 , 10 )/ 10000.0
98
96
else :
99
97
batch_size = np .random .randint (1 , 10 )* 100
100
- learning_rate = np .random .randint (1 , 10 )/ 100000.0
98
+ learning_rate = np .random .randint (1 , 10 )/ 100000.0
101
99
threads = []
102
100
id = model + "_" + str (n_workers )+ "_" + str (n_intra )+ "_" + optimizer + "_" + str (learning_rate )+ "_" + str (batch_size )+ "_" + str (n_partition )
103
101
t1 = threading .Thread (target = execute ,args = (model , n_workers , n_ps , n_intra , n_partition , optimizer , batch_size , learning_rate ))
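One detail worth flagging in this last hunk: `np.random.randint(low, high)` excludes `high`, so the updated `Optimizer[np.random.randint(0, 5)]` can never pick index 5 ("RMSProp") of the six-element list, just as the old `randint(0, 6)` never reached "RMSProp" at index 6 of the seven-element list. If uniform sampling over every optimizer is the intent, a sketch:

```python
import numpy as np

Optimizer = ["SGD", "Adadelta", "Adagrad", "Adam", "Momentum", "RMSProp"]

# randint's upper bound is exclusive, so size the range from the list itself;
# np.random.choice is an equivalent one-liner.
optimizer = Optimizer[np.random.randint(0, len(Optimizer))]
optimizer = np.random.choice(Optimizer)
```

Separately, the hunk removes the non-CNN `learning_rate` assignment without a replacement; unless `learning_rate` is initialized earlier in `run` (not shown in this hunk), building the `id` string would raise a NameError for non-CNN models.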
0 commit comments