
Commit 2975b51 ("update")

1 parent 9bf7402 commit 2975b51

6 files changed: +103 -20 lines

README.md

+41-8
@@ -1,14 +1,17 @@
 # bert_language_understanding
 Pre-train is all you need!

-An tensorflow implementation of Pre-training of Deep Bidirectional Transformers for Language Understanding
+BERT has recently achieved new state-of-the-art results on more than 10 NLP tasks.

-(Bert) and Attention is all you need(Transformer). BERT achieve new state of art result on more than 10 nlp tasks recently.
+This is a TensorFlow implementation of Pre-training of Deep Bidirectional Transformers for Language Understanding
+(BERT) and Attention Is All You Need (the Transformer).

 Update: most of the work of replicating the main ideas of these two papers is done; there is a clear performance gain
 from pre-training a model and then fine-tuning it, compared to training the model from scratch.
+
 We ran an experiment replacing BERT's backbone network (the Transformer) with TextCNN, and found that
 pre-training the model with a masked language model on lots of raw data can boost performance by a notable amount.
@@ -17,7 +20,14 @@ More generally, we believe that pre-train and fine-tuning strategy is model inde
 That being said, you can replace the backbone network as you like, and add more pre-train tasks or define new pre-train tasks as
-you can, pre-train will not be limited to masked language model and or predict next sentence task.
+you wish; pre-training is not limited to the masked language model or the next-sentence prediction task. What surprised us is that,
+with a middle-sized data set of, say, one million examples, even without using any external data, a pre-train task such as the masked
+language model can boost performance by a big margin, and the model converges faster; sometimes only a few epochs are needed
+in the fine-tuning stage.

 While there is an open-source implementation (<a href='https://github.com/tensorflow/tensor2tensor'>tensor2tensor</a>) and an official
@@ -125,9 +135,13 @@ if you want to try BERT with pre-train of masked language model and fine-tuning.
 or, to train a small model, use d_model=128, h=8, d_k=d_v=16 (small), or d_model=64, h=8, d_k=d_v=8 (tiny).

-## Data Format and Sample Data
+## Sample Data, Data Format & Suggestions to the User

-##### for train transform:
+##### for the pre-train stage
+each line is a document (several sentences) or a single sentence; this is free text you can obtain easily.
+
+##### for data used in the fine-tuning stage:

 input and output are on the same line, and each label starts with '__label__'.
@@ -138,11 +152,30 @@ token1 token2 token3 __label__l1 __label__l5 __label__l3

 token1 token2 token3 __label__l2 __label__l4

-##### for pre-train masked language with BERT:

-each line is a sentence or serveral sentences( that is raw data you can get easily)
+Check the 'data' folder for sample data. <a href='https://pan.baidu.com/s/1HUzBXB_-zzqv-abWZ74w2Q'>Download a middle-sized data set here
+(450k examples, 206 classes)</a>. Each input is a document with an average length of around 300; one or more labels are associated with each input.
+
+##### Suggestions to the User

-check 'data' folder for sample data.
+1. Things can be easy: 1) download the dataset (around 200M), 2) run step 1 for pre-training, and 3) run step 2 for fine-tuning.
+
+2. I have finished the three steps above and want better performance. What can I do next? Do I need to find a bigger dataset?
+
+No. You can generate a big data set for the pre-train stage yourself by downloading some free text; make sure each line is a
+document or a sentence, then replace data/bert_train2.txt with your new data file.
+
+3. What's more?
+
+Try bigger hyper-parameters or a bigger model (by replacing the backbone network) until it can take advantage of all your pre-train data.
+Play around with the model in model/bert_cnn_model.py, or check the pre-processing in data_util_hdf5.py.
+
+##### for pre-train masked language with BERT:

 ## Pretrain Language Understanding Task
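The fine-tuning format above is the fastText-style one-line-per-example layout. For readers who want to sanity-check their own files against it, here is a minimal parsing sketch (not the repository's loader, which lives in data_util_hdf5.py):

```python
# Minimal sketch: split one fine-tuning line into (tokens, labels).
# Assumes the format described above: tokens first, then labels
# marked with the '__label__' prefix.
def split_tokens_and_labels(line):
    tokens, labels = [], []
    for item in line.strip().split():
        if item.startswith('__label__'):
            labels.append(item[len('__label__'):])
        else:
            tokens.append(item)
    return tokens, labels

print(split_tokens_and_labels("token1 token2 token3 __label__l1 __label__l5 __label__l3"))
# -> (['token1', 'token2', 'token3'], ['l1', 'l5', 'l3'])
```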

data_util_hdf5.py

+2
@@ -301,6 +301,8 @@ def get_lable2index(data_path,training_data_path,tokenize_style='word'):
         return pickle.load(data_f)
     file_object = codecs.open(training_data_path, mode='r', encoding='utf-8')
     lines=file_object.readlines()
+    random.shuffle(lines)
+    lines=lines[0:60000]  # only use 60k lines, to make training fast
     c_labels=Counter()
     for i,line in enumerate(lines):
         _,input_label=get_input_strings_and_labels(line, tokenize_style=tokenize_style)
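The two added lines subsample the training file before the label vocabulary is counted; shuffling first matters because a plain truncation could miss labels that only occur late in the file. A toy sketch of the same idea, with a made-up corpus and cap, is shown below.

```python
# Toy sketch of the subsampling idea in get_lable2index: shuffle so a truncated
# read still sees labels from the whole file, then count them. The 5-line
# corpus and the cap of 3 are invented for illustration.
import random
from collections import Counter

lines = ["a b __label__l1", "c d __label__l2", "e __label__l1",
         "f __label__l3", "g h __label__l2"]
random.shuffle(lines)   # spread labels across the kept prefix
lines = lines[0:3]      # keep only a subsample to speed things up

c_labels = Counter()
for line in lines:
    c_labels.update(t[len('__label__'):] for t in line.split() if t.startswith('__label__'))
print(c_labels)
```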

model/bert_cnn_model.py

+1-1
@@ -12,7 +12,7 @@
 from model.encoder import Encoder
 from model.config import Config
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "7"
+#os.environ["CUDA_VISIBLE_DEVICES"] = "7"

 class BertCNNModel:
     def __init__(self,config):
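With the hard-coded assignment commented out, the model no longer pins itself to GPU 7; the device can be chosen by whoever launches the training script. One possible pattern (a sketch, with "0" as an arbitrary default) is:

```python
import os
# Sketch: respect a CUDA_VISIBLE_DEVICES value exported by the caller, and only
# fall back to "0" (an arbitrary example) if nothing was set.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
```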

temp_covert.py

+48
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+import json
+import random
+
+dict_unique={}
+
+dict_type_ignore_count={'train':0,'valid':0,'test':0}
+def transform_data_to_fasttext_format(file_path,target_path,data_type):
+    file_object=open(file_path,'r')
+    target_object=open(target_path,'w')
+    lines=file_object.readlines()
+    print("length of lines:",len(lines))
+    random.shuffle(lines)
+    for i,line in enumerate(lines):
+        json_string=json.loads(line)
+        accusation_list=json_string['meta']['accusation']
+        fact=json_string['fact'].strip('\n\r').replace("\n","").replace("\r","")
+        unique_value=dict_unique.get(fact,None)
+        if unique_value is None: # if not exist, put to unique dict, then process
+            dict_unique[fact] = fact
+        else: # otherwise, ignore
+            print("going to ignore.",data_type,fact)
+            dict_type_ignore_count[data_type]=dict_type_ignore_count[data_type]+1
+            continue
+        length_accusation=len(accusation_list)
+        #if length_accusation>1:
+            #print("accusation_list:",str(accusation_list))
+            #print("json_string:",json_string)
+        accusation_strings=''
+        for i,accusation in enumerate(accusation_list):
+            accusation_strings+=' __label__'+accusation
+        target_object.write(fact+accusation_strings+"\n")
+    target_object.close()
+    file_object.close()
+    print("dict_type_ignore_count:",dict_type_ignore_count[data_type])
+
+file_path='./data/cail2018/data_valid_checked.json'
+target_path='./data/data_valid2.txt'
+transform_data_to_fasttext_format(file_path,target_path,'valid')
+
+file_path='./data/cail2018/data_test.json'
+target_path='./data/data_test2.txt'
+transform_data_to_fasttext_format(file_path,target_path,'test')
+
+file_path='./data/cail2018/cail2018_big_downsmapled.json'
+target_path='./data/data_train2.txt'
+transform_data_to_fasttext_format(file_path,target_path,'train')
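To make the conversion concrete, the snippet below applies the same fact-plus-'__label__' transformation to a single made-up record; the fact and accusation values are invented and are not taken from the CAIL2018 data.

```python
# Sketch of the per-record transformation in temp_covert.py on an invented record.
import json

line = json.dumps({"fact": "token1 token2 token3",
                   "meta": {"accusation": ["theft", "fraud"]}})
record = json.loads(line)
labels = ''.join(' __label__' + a for a in record['meta']['accusation'])
print(record['fact'] + labels)
# -> token1 token2 token3 __label__theft __label__fraud
```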

train_bert_fine_tuning.py

+9-9
@@ -23,9 +23,9 @@
 FLAGS=tf.app.flags.FLAGS

 tf.app.flags.DEFINE_string("data_path","./data/","path of traning data.")
-tf.app.flags.DEFINE_string("training_data_file","./data/bert_train.txt","path of traning data.") #./data/cail2018_bi.json
-tf.app.flags.DEFINE_string("valid_data_file","./data/bert_train.txt","path of validation data.")
-tf.app.flags.DEFINE_string("test_data_file","./data/bert_test.txt","path of validation data.")
+tf.app.flags.DEFINE_string("training_data_file","./data/bert_train2.txt","path of traning data.") #./data/cail2018_bi.json
+tf.app.flags.DEFINE_string("valid_data_file","./data/bert_valid2.txt","path of validation data.")
+tf.app.flags.DEFINE_string("test_data_file","./data/bert_test2.txt","path of validation data.")
 tf.app.flags.DEFINE_string("ckpt_dir","./checkpoint_lm/","checkpoint location for the model for restore from pre-train") #save to here, so make it easy to upload for test
 tf.app.flags.DEFINE_string("ckpt_dir_save","./checkpoint_lm_save/","checkpoint location for the model for save fine-tuning") #save to here, so make it easy to upload for test

@@ -35,21 +35,21 @@
 tf.app.flags.DEFINE_float("learning_rate",0.00001,"learning rate") #0.001
 tf.app.flags.DEFINE_integer("batch_size", 64, "Batch size for training/evaluating.") # 32-->128
 tf.app.flags.DEFINE_integer("decay_steps", 10000, "how many steps before decay learning rate.") # 32-->128
-tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.65
+tf.app.flags.DEFINE_float("decay_rate", 0.8, "Rate of decay for learning rate.") #0.65
 tf.app.flags.DEFINE_float("dropout_keep_prob", 0.9, "percentage to keep when using dropout.") #0.65
 tf.app.flags.DEFINE_integer("sequence_length",200,"max sentence length")#400
 tf.app.flags.DEFINE_integer("sequence_length_lm",10,"max sentence length for masked language model")

 tf.app.flags.DEFINE_boolean("is_training",True,"is training.true:tranining,false:testing/inference")
 tf.app.flags.DEFINE_boolean("is_fine_tuning",True,"is_finetuning.ture:this is fine-tuning stage")

-tf.app.flags.DEFINE_integer("num_epochs",30,"number of epochs to run.")
-tf.app.flags.DEFINE_integer("process_num",3,"number of cpu used")
+tf.app.flags.DEFINE_integer("num_epochs",35,"number of epochs to run.")
+tf.app.flags.DEFINE_integer("process_num",35,"number of cpu used")

 tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #
 tf.app.flags.DEFINE_boolean("use_pretrained_embedding",False,"whether to use embedding or not.")#
 tf.app.flags.DEFINE_string("word2vec_model_path","./data/Tencent_AILab_ChineseEmbedding_100w.txt","word2vec's vocabulary and vectors") # data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5--->data/news_12g_baidubaike_20g_novel_90g_embedding_64.bin--->sgns.merge.char
-tf.app.flags.DEFINE_boolean("test_mode",True,"whether it is test mode. if it is test mode, only small percentage of data will be used. test mode for test purpose.")
+tf.app.flags.DEFINE_boolean("test_mode",False,"whether it is test mode. if it is test mode, only small percentage of data will be used. test mode for test purpose.")

 tf.app.flags.DEFINE_integer("d_model", 64, "dimension of model") # 512-->128
 tf.app.flags.DEFINE_integer("num_layer", 6, "number of layer")

@@ -81,7 +81,7 @@ def main(_):
     if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
         print("Restoring Variables from Checkpoint.")
         sess.run(tf.global_variables_initializer())
-        for i in range(2): #decay learning rate if necessary.
+        for i in range(6): #decay learning rate if necessary.
             print(i,"Going to decay learning rate by a factor of "+str(FLAGS.decay_rate))
             sess.run(model.learning_rate_decay_half_op)
         # restore those variables that names and shapes exists in your model from checkpoint. for detail check: https://gist.github.com/iganichev/d2d8a0b1abc6b15d4a07de83171163d4

@@ -110,7 +110,7 @@ def main(_):
             current_loss,lr,l2_loss,_=sess.run([model.loss_val,model.learning_rate,model.l2_loss,model.train_op],feed_dict)
             loss_total,counter=loss_total+current_loss,counter+1
             if counter %30==0:
-                print("Learning rate:%.5f\tLoss:%.3f\tCurrent_loss:%.3f\tL2_loss%.3f\t"%(lr,float(loss_total)/float(counter),current_loss,l2_loss))
+                print("Learning rate:%.7f\tLoss:%.3f\tCurrent_loss:%.3f\tL2_loss%.3f\t"%(lr,float(loss_total)/float(counter),current_loss,l2_loss))
             if start!=0 and start%(4000*FLAGS.batch_size)==0:
                 loss_valid, f1_macro_valid, f1_micro_valid= do_eval(sess, model, valid,num_classes,label2index)
                 f1_score_valid=((f1_macro_valid+f1_micro_valid)/2.0) #*100.0
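The hyper-parameter changes interact: with the base learning rate of 0.00001, applying the decay op six times at the new decay_rate of 0.8 starts fine-tuning from a much smaller value, which is presumably why the log format was widened from %.5f to %.7f. A quick arithmetic check, assuming the op multiplies the learning rate by decay_rate on each call (as the surrounding print suggests):

```python
# Assumed behaviour: each call to learning_rate_decay_half_op multiplies the
# learning rate by FLAGS.decay_rate (0.8), as the print statement suggests.
base_lr = 0.00001
effective_lr = base_lr * 0.8 ** 6      # ~2.6e-06 after the six decay steps
print("%.5f" % effective_lr)           # 0.00000  (invisible at 5 decimal places)
print("%.7f" % effective_lr)           # 0.0000026
```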

train_bert_lm.py

+2-2
@@ -23,7 +23,7 @@
 #configuration
 FLAGS=tf.app.flags.FLAGS

-tf.app.flags.DEFINE_boolean("test_mode",True,"whether it is test mode. if it is test mode, only small percentage of data will be used")
+tf.app.flags.DEFINE_boolean("test_mode",False,"whether it is test mode. if it is test mode, only small percentage of data will be used")
 tf.app.flags.DEFINE_string("data_path","./data/","path of traning data.")
 tf.app.flags.DEFINE_string("mask_lm_source_file","./data/bert_train2.txt","path of traning data.")
 tf.app.flags.DEFINE_string("ckpt_dir","./checkpoint_lm/","checkpoint location for the model") #save to here, so make it easy to upload for test

@@ -49,7 +49,7 @@
 tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.")
 tf.app.flags.DEFINE_boolean("use_pretrained_embedding",False,"whether to use embedding or not.")#
 tf.app.flags.DEFINE_string("word2vec_model_path","./data/Tencent_AILab_ChineseEmbedding_100w.txt","word2vec's vocabulary and vectors") # data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5--->data/news_12g_baidubaike_20g_novel_90g_embedding_64.bin--->sgns.merge.char
-tf.app.flags.DEFINE_integer("process_num",20,"number of cpu process")
+tf.app.flags.DEFINE_integer("process_num",35,"number of cpu process")

 def main(_):
     vocab_word2index, _= create_or_load_vocabulary(FLAGS.data_path,FLAGS.mask_lm_source_file,FLAGS.vocab_size,test_mode=FLAGS.test_mode,tokenize_style=FLAGS.tokenize_style)
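train_bert_lm.py drives the masked-language-model pre-training stage described in the README, reading raw text from mask_lm_source_file. As a rough illustration of the masking idea only (the repository's actual routine lives in data_util_hdf5.py and may differ in details such as the 80/10/10 replacement rule), here is a minimal sketch that hides a random 15% of tokens and keeps the originals as prediction targets:

```python
# Rough sketch of masked-language-model input creation, not the repository's code:
# hide about 15% of tokens behind a [MASK] symbol and remember what was hidden.
import random

def mask_tokens(tokens, mask_rate=0.15, mask_symbol='[MASK]'):
    masked, targets = list(tokens), {}
    num_to_mask = max(1, int(len(tokens) * mask_rate))
    for idx in random.sample(range(len(tokens)), num_to_mask):
        targets[idx] = masked[idx]   # position -> original token to predict
        masked[idx] = mask_symbol
    return masked, targets

print(mask_tokens("the quick brown fox jumps over the lazy dog".split()))
```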
