
Commit 56bfb8b

Commit message: update
1 parent f187e12 commit 56bfb8b

File tree

5 files changed: +19 -10 lines changed


Diff for: README.md

+12-4
@@ -117,14 +117,14 @@ if you want to try BERT with pre-train of masked language model and fine-tuning.
 
 than training from completely new, and f1 score is also higher while new model may start from 0.
 
-Notice: to help you try new idea first, you can set hypermater test_mode to True. it will only load few data, and start to training quickly.
+Notice: to help you try a new idea quickly, you can set the hyper-parameter test_mode to True; it will load only a small amount of data and start training right away.
 
 
-##### [basic step] to handle a classification problem with transform:
+##### [basic step] to handle a classification problem with transform (optional):
 
 python train_transform.py [DONE, but a bug exists that prevents it from converging; you are welcome to fix it, email: [email protected]]
 
-#### Optional hypermeters
+#### Optional hyper-parameters
 
 d_model: dimension of model. [512]
 
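Since test_mode is defined through tf.app.flags (see the train_bert_fine_tuning.py diff further down), it can be switched on from the command line rather than by editing code. A minimal sketch of the invocation, in the same bare-command style the README already uses, assuming standard tf.app.flags command-line parsing:

    python train_bert_fine_tuning.py --test_mode=True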
@@ -146,6 +146,7 @@ if you want to try BERT with pre-train of masked language model and fine-tuning.
 ##### for pre-train stage
 each line is a document (several sentences) or a single sentence; this is free text that you can get easily.
 
+check data/bert_train.txt or bert_train2.txt in the zip file.
 
 ##### for data used on fine-tuning stage:
 

@@ -158,14 +159,21 @@ token1 token2 token3 __label__l1 __label__l5 __label__l3
 
 token1 token2 token3 __label__l2 __label__l4
 
+check data/bert_train.txt or bert_train2.txt in the zip file.
 
 check the 'data' folder for sample data. <a href='https://pan.baidu.com/s/1HUzBXB_-zzqv-abWZ74w2Q'>download a middle-sized data set here
 
 </a>with 450k examples and 206 classes; each input is a document with an average length of around 300, and one or more labels associated with each input.
 
 ## Suggestion for User
 
-1. things can be easy: 1) download dataset(around 200M),2) run step 1 for pre-train, 3) and run step 2 for fine-tuning.
+1. things can be easy:
+
+1) download the data set (around 200M, 450k examples, with some cache files), unzip it, and put it in the data/ folder,
+
+2) run step 1 for pre-training,
+
+3) run step 2 for fine-tuning.
 
 2. I finished the above three steps and want better performance. What can I do further? Do I need to find a bigger dataset?
 
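The fine-tuning lines above follow the fastText convention: whitespace-separated tokens, then one or more __label__ markers for a single- or multi-label example. A minimal Python parsing sketch; parse_line and LABEL_PREFIX are illustrative names, not code from this repo:

    LABEL_PREFIX = "__label__"

    def parse_line(line):
        """Split one training line into (tokens, labels)."""
        parts = line.strip().split()
        tokens = [p for p in parts if not p.startswith(LABEL_PREFIX)]
        labels = [p[len(LABEL_PREFIX):] for p in parts if p.startswith(LABEL_PREFIX)]
        return tokens, labels

    tokens, labels = parse_line("token1 token2 token3 __label__l1 __label__l5 __label__l3")
    assert tokens == ["token1", "token2", "token3"]
    assert labels == ["l1", "l5", "l3"]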

Diff for: model/bert_cnn_model.py

+1
@@ -305,6 +305,7 @@ def train():
     save_path = config.ckpt_dir + "model.ckpt"
     #if not os.path.exists(config.ckpt_dir):
     #    os.makedirs(config.ckpt_dir)
+    batch_size = 8
     with tf.Session(config=gpu_config) as sess:
         sess.run(tf.global_variables_initializer())
         if os.path.exists(config.ckpt_dir): #

Diff for: model/bert_model.py

+1
@@ -212,6 +212,7 @@ def train():
     gpu_config.gpu_options.allow_growth = True
     saver = tf.train.Saver()
     save_path = config.ckpt_dir + "model.ckpt"
+    batch_size = 8
     #if not os.path.exists(config.ckpt_dir):
     #    os.makedirs(config.ckpt_dir)
     with tf.Session(config=gpu_config) as sess:
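Both model diffs above pin batch_size = 8 immediately before the session is created, shadowing whatever value the config object carries for the rest of train(). A small sketch of how such a local override feeds a mini-batch loop; iterate_minibatches and num_examples are assumptions for illustration, not repo code:

    batch_size = 8  # local override, as in the two diffs; any configured value is ignored below

    def iterate_minibatches(num_examples, batch_size):
        """Yield (start, end) index pairs that cover the training set."""
        for start in range(0, num_examples, batch_size):
            yield start, min(start + batch_size, num_examples)

    for start, end in iterate_minibatches(num_examples=20, batch_size=batch_size):
        print(start, end)  # prints 0 8, then 8 16, then 16 20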

Diff for: pretrain_task.py

+2-3
@@ -257,10 +257,9 @@ def process_one_chunk_lm(lines,max_allow_sentence_length,index2word,sub_target_f
             y_mask_lm.append(mask_word_indexed) # input(y) to list
             p_mask_lm.append(index)
             # print some log
-            count = count + 1
             if i % 1000 == 0:
-                print(count, "index:", index, "i:", i, "j:", j, ";mask_word_1:", mask_word, ";string_list:", string_list)
-                print(count, "index:", index, "i:", i, "j:", j, ";mask_word_indexed:", mask_word_indexed, ";string_list_indexed:", string_list_indexed)
+                print("index:", index, "i:", i, "j:", j, ";mask_word_1:", mask_word, ";string_list:", string_list)
+                print("index:", index, "i:", i, "j:", j, ";mask_word_indexed:", mask_word_indexed, ";string_list_indexed:", string_list_indexed)
     # save to file system
     X_mask_lm = np.array(X_mask_lm)
     y_mask_lm = np.array(y_mask_lm)
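Judging from the variable names in this hunk (string_list_indexed, mask_word_indexed, p_mask_lm), process_one_chunk_lm collects, per masked-LM example, the masked sentence, the original token id as the prediction target, and the masked position. A hedged sketch of that construction; make_masked_lm_example and MASK_ID are assumptions, not code from this repo:

    import random

    MASK_ID = 0  # assumed vocabulary id of the [MASK] token

    def make_masked_lm_example(string_list_indexed):
        """Return (masked_sentence, target_token_id, masked_position)."""
        index = random.randrange(len(string_list_indexed))  # position to mask
        mask_word_indexed = string_list_indexed[index]      # original token id, the target
        masked = list(string_list_indexed)
        masked[index] = MASK_ID                             # replace it with [MASK]
        return masked, mask_word_indexed, index

    x, y, p = make_masked_lm_example([11, 42, 7, 99])
    # e.g. x == [11, 0, 7, 99], y == 42, p == 1 when position 1 is chosen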

Diff for: train_bert_fine_tuning.py

+3-3
@@ -51,11 +51,11 @@
 tf.app.flags.DEFINE_string("word2vec_model_path","./data/Tencent_AILab_ChineseEmbedding_100w.txt","word2vec's vocabulary and vectors") # data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5--->data/news_12g_baidubaike_20g_novel_90g_embedding_64.bin--->sgns.merge.char
 tf.app.flags.DEFINE_boolean("test_mode",False,"whether it is test mode. if it is test mode, only a small percentage of the data will be used. test mode is for testing purposes.")
 
-tf.app.flags.DEFINE_integer("d_model", 64, "dimension of model") # 512-->128
+tf.app.flags.DEFINE_integer("d_model", 128, "dimension of model") # 512-->128
 tf.app.flags.DEFINE_integer("num_layer", 6, "number of layer")
 tf.app.flags.DEFINE_integer("num_header", 8, "number of header")
-tf.app.flags.DEFINE_integer("d_k", 8, "dimension of k") # 64-->16
-tf.app.flags.DEFINE_integer("d_v", 8, "dimension of v") # 64-->16
+tf.app.flags.DEFINE_integer("d_k", 16, "dimension of k") # 64-->16
+tf.app.flags.DEFINE_integer("d_v", 16, "dimension of v") # 64-->16
 
 def main(_):
     # 1. load the vocabulary of tokens from the cache file saved during the pre-training stage; load the label dict from the training file; print some messages.
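The three flag changes move together, consistent with the standard multi-head attention sizing d_k = d_v = d_model / num_header: with num_header = 8, raising d_model from 64 to 128 raises the per-head dimensions from 8 to 16. A quick check (the sizing rule is an inference from the diff, not something this commit states):

    d_model, num_header = 128, 8           # new value from the diff
    d_k = d_v = d_model // num_header      # per-head dimensions
    assert (d_k, d_v) == (16, 16)          # matches the new flag values

    old_d_model = 64
    assert old_d_model // num_header == 8  # matches the old flag values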
