
Commit 56bfb8b

Commit message: update
1 parent f187e12 commit 56bfb8b

File tree

5 files changed: +19 -10 lines changed


Diff for: README.md

+12-4
@@ -117,14 +117,14 @@ if you want to try BERT with pre-train of masked language model and fine-tuning.
 
 than training from completely new, and f1 score is also higher while new model may start from 0.
 
-Notice: to help you try new idea first, you can set hypermater test_mode to True. it will only load few data, and start to training quickly.
+Notice: to help you try a new idea quickly, you can set the hyper-parameter test_mode to True; it will load only a small amount of data and start training right away.
 
 
-##### [basic step] to handle a classification problem with transform:
+##### [basic step] to handle a classification problem with transform (optional):
 
 python train_transform.py [DONE, but a bug exists that prevents it from converging; you are welcome to fix it, email: [email protected]]
 
-#### Optional hypermeters
+#### Optional hyper-parameters
 
 d_model: dimension of model. [512]
 
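Since test_mode is defined through tf.app.flags (see the train_bert_fine_tuning.py diff further down), it can be switched on from the command line rather than by editing code. A minimal sketch of the invocation, in the same bare-command style the README already uses, assuming standard tf.app.flags command-line parsing:

    python train_bert_fine_tuning.py --test_mode=True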
@@ -146,6 +146,7 @@ if you want to try BERT with pre-train of masked language model and fine-tuning.
 ##### for pre-train stage
 each line is a document (several sentences) or a single sentence; this is free text that you can get easily.
 
+check data/bert_train.txt or bert_train2.txt in the zip file.
 
 ##### for data used on fine-tuning stage:
 

@@ -158,14 +159,21 @@ token1 token2 token3 __label__l1 __label__l5 __label__l3
 
 token1 token2 token3 __label__l2 __label__l4
 
+check data/bert_train.txt or bert_train2.txt in the zip file.
 
 check the 'data' folder for sample data. <a href='https://pan.baidu.com/s/1HUzBXB_-zzqv-abWZ74w2Q'>download a middle-sized data set here
 
 </a>with 450k examples and 206 classes; each input is a document with an average length of around 300, and one or more labels associated with each input.
 
 ## Suggestion for User
 
-1. things can be easy: 1) download dataset(around 200M),2) run step 1 for pre-train, 3) and run step 2 for fine-tuning.
+1. things can be easy:
+
+1) download the data set (around 200M, 450k examples, with some cache files), unzip it, and put it in the data/ folder,
+
+2) run step 1 for pre-training,
+
+3) run step 2 for fine-tuning.
 
 2. I finished the above three steps and want better performance. What can I do further? Do I need to find a bigger dataset?
 
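The fine-tuning lines above follow the fastText convention: whitespace-separated tokens, then one or more __label__ markers for a single- or multi-label example. A minimal Python parsing sketch; parse_line and LABEL_PREFIX are illustrative names, not code from this repo:

    LABEL_PREFIX = "__label__"

    def parse_line(line):
        """Split one training line into (tokens, labels)."""
        parts = line.strip().split()
        tokens = [p for p in parts if not p.startswith(LABEL_PREFIX)]
        labels = [p[len(LABEL_PREFIX):] for p in parts if p.startswith(LABEL_PREFIX)]
        return tokens, labels

    tokens, labels = parse_line("token1 token2 token3 __label__l1 __label__l5 __label__l3")
    assert tokens == ["token1", "token2", "token3"]
    assert labels == ["l1", "l5", "l3"]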

Diff for: model/bert_cnn_model.py

+1
@@ -305,6 +305,7 @@ def train():
     save_path = config.ckpt_dir + "model.ckpt"
     #if not os.path.exists(config.ckpt_dir):
     #    os.makedirs(config.ckpt_dir)
+    batch_size = 8
     with tf.Session(config=gpu_config) as sess:
         sess.run(tf.global_variables_initializer())
         if os.path.exists(config.ckpt_dir): #

Diff for: model/bert_model.py

+1
@@ -212,6 +212,7 @@ def train():
     gpu_config.gpu_options.allow_growth = True
     saver = tf.train.Saver()
     save_path = config.ckpt_dir + "model.ckpt"
+    batch_size = 8
     #if not os.path.exists(config.ckpt_dir):
     #    os.makedirs(config.ckpt_dir)
     with tf.Session(config=gpu_config) as sess:
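Both model diffs above pin batch_size = 8 immediately before the session is created, shadowing whatever value the config object carries for the rest of train(). A small sketch of how such a local override feeds a mini-batch loop; iterate_minibatches and num_examples are assumptions for illustration, not repo code:

    batch_size = 8  # local override, as in the two diffs; any configured value is ignored below

    def iterate_minibatches(num_examples, batch_size):
        """Yield (start, end) index pairs that cover the training set."""
        for start in range(0, num_examples, batch_size):
            yield start, min(start + batch_size, num_examples)

    for start, end in iterate_minibatches(num_examples=20, batch_size=batch_size):
        print(start, end)  # prints 0 8, then 8 16, then 16 20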

Diff for: pretrain_task.py

+2-3
@@ -257,10 +257,9 @@ def process_one_chunk_lm(lines,max_allow_sentence_length,index2word,sub_target_f
             y_mask_lm.append(mask_word_indexed) # input(y) to list
             p_mask_lm.append(index)
             # print some log
-            count = count + 1
             if i % 1000 == 0:
-                print(count, "index:", index, "i:", i, "j:", j, ";mask_word_1:", mask_word, ";string_list:", string_list)
-                print(count, "index:", index, "i:", i, "j:", j, ";mask_word_indexed:", mask_word_indexed, ";string_list_indexed:", string_list_indexed)
+                print("index:", index, "i:", i, "j:", j, ";mask_word_1:", mask_word, ";string_list:", string_list)
+                print("index:", index, "i:", i, "j:", j, ";mask_word_indexed:", mask_word_indexed, ";string_list_indexed:", string_list_indexed)
     # save to file system
     X_mask_lm = np.array(X_mask_lm)
     y_mask_lm = np.array(y_mask_lm)
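Judging from the variable names in this hunk (string_list_indexed, mask_word_indexed, p_mask_lm), process_one_chunk_lm collects, per masked-LM example, the masked sentence, the original token id as the prediction target, and the masked position. A hedged sketch of that construction; make_masked_lm_example and MASK_ID are assumptions, not code from this repo:

    import random

    MASK_ID = 0  # assumed vocabulary id of the [MASK] token

    def make_masked_lm_example(string_list_indexed):
        """Return (masked_sentence, target_token_id, masked_position)."""
        index = random.randrange(len(string_list_indexed))  # position to mask
        mask_word_indexed = string_list_indexed[index]      # original token id, the target
        masked = list(string_list_indexed)
        masked[index] = MASK_ID                             # replace it with [MASK]
        return masked, mask_word_indexed, index

    x, y, p = make_masked_lm_example([11, 42, 7, 99])
    # e.g. x == [11, 0, 7, 99], y == 42, p == 1 when position 1 is chosen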

Diff for: train_bert_fine_tuning.py

+3-3
@@ -51,11 +51,11 @@
 tf.app.flags.DEFINE_string("word2vec_model_path","./data/Tencent_AILab_ChineseEmbedding_100w.txt","word2vec's vocabulary and vectors") # data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5--->data/news_12g_baidubaike_20g_novel_90g_embedding_64.bin--->sgns.merge.char
 tf.app.flags.DEFINE_boolean("test_mode",False,"whether it is test mode. if it is test mode, only a small percentage of the data will be used. test mode is for testing purposes.")
 
-tf.app.flags.DEFINE_integer("d_model", 64, "dimension of model") # 512-->128
+tf.app.flags.DEFINE_integer("d_model", 128, "dimension of model") # 512-->128
 tf.app.flags.DEFINE_integer("num_layer", 6, "number of layer")
 tf.app.flags.DEFINE_integer("num_header", 8, "number of header")
-tf.app.flags.DEFINE_integer("d_k", 8, "dimension of k") # 64-->16
-tf.app.flags.DEFINE_integer("d_v", 8, "dimension of v") # 64-->16
+tf.app.flags.DEFINE_integer("d_k", 16, "dimension of k") # 64-->16
+tf.app.flags.DEFINE_integer("d_v", 16, "dimension of v") # 64-->16
 
 def main(_):
     # 1. load the vocabulary of tokens from the cache file saved during the pre-training stage; load the label dict from the training file; print some messages.
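The three flag changes move together, consistent with the standard multi-head attention sizing d_k = d_v = d_model / num_header: with num_header = 8, raising d_model from 64 to 128 raises the per-head dimensions from 8 to 16. A quick check (the sizing rule is an inference from the diff, not something this commit states):

    d_model, num_header = 128, 8           # new value from the diff
    d_k = d_v = d_model // num_header      # per-head dimensions
    assert (d_k, d_v) == (16, 16)          # matches the new flag values

    old_d_model = 64
    assert old_d_model // num_header == 8  # matches the old flag values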
