
Commit 70e39ad

Author: Ubuntu
Re-organized shuffled files, all scripts work from wikinews.db file, added Dataset Creation Notebook tutorial
1 parent 533bb59 commit 70e39ad

10 files changed: +184 -96 lines

.gitignore

+1
@@ -0,0 +1 @@
+*.pyc

Dataset SQLite3 Example.ipynb

+97
@@ -0,0 +1,97 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SQLite Creation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Example of creating a dataset that will be compatible with [Summary Loop](https://github.com/CannyLab/summary_loop) training scripts.\n",
+    "\n",
+    "Another option is to modify the [collate_fn](https://pytorch.org/docs/stable/data.html) of the scripts, to interface with another data format.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sqlite3, os # This is a built in, no need to install\n",
+    "\n",
+    "example_dataset = [\n",
+    "    {\"title\": \"Example document1\", \"body\": \"The body of the very first document in the collection\"},\n",
+    "    {\"title\": \"Example document2\", \"body\": \"The body of the second document in the collection. You could put any data in here.\"},\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_file = \"/home/phillab/dataset/example_dataset.db\"\n",
+    "\n",
+    "if os.path.isfile(db_file):\n",
+    "    os.remove(db_file)\n",
+    "\n",
+    "conn = sqlite3.connect(db_file, detect_types=sqlite3.PARSE_DECLTYPES)\n",
+    "conn.row_factory = sqlite3.Row\n",
+    "c = conn.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# CREATE TABLE\n",
+    "\n",
+    "sql_create = \"CREATE TABLE articles (id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT NOT NULL, body TEXT);\"\n",
+    "c.execute(sql_create)\n",
+    "conn.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sql_insert = \"INSERT INTO articles (title, body) VALUES (?, ?)\"\n",
+    "\n",
+    "for a in example_dataset:\n",
+    "    c.execute(sql_insert, (a['title'], a['body']))\n",
+    "conn.commit()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
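The training scripts below consume the resulting .db file through utils_dataset.SQLDataset, which is imported in every diff in this commit but whose implementation is not part of the diff. As a rough sketch only (the class and method details here are assumptions, not the repository's actual code), a PyTorch-compatible reader over the articles table created by this notebook could look like:

import sqlite3
from torch.utils.data import Dataset

class ExampleSQLDataset(Dataset):
    # Hypothetical stand-in for utils_dataset.SQLDataset; assumes the `articles` schema created above.
    def __init__(self, db_file):
        self.conn = sqlite3.connect(db_file, detect_types=sqlite3.PARSE_DECLTYPES)
        self.conn.row_factory = sqlite3.Row  # rows act like dicts: row['title'], row['body']
        cur = self.conn.cursor()
        cur.execute("SELECT COUNT(*) FROM articles")
        self.length = cur.fetchone()[0]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        cur = self.conn.cursor()
        # AUTOINCREMENT ids start at 1, so shift the 0-based index
        cur.execute("SELECT * FROM articles WHERE id = ?", (idx + 1,))
        return cur.fetchone()

The collate functions changed in the scripts below rely on exactly this row interface, reading inp['body'] and doc['title'] by column name.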
File renamed without changes.
File renamed without changes.

pretrain_bert.py

+4 -3
@@ -2,8 +2,9 @@
 from pytorch_transformers.tokenization_bert import BertTokenizer
 from pytorch_transformers.modeling_bert import BertForPreTraining
 from torch.utils.data import DataLoader, RandomSampler
+import torch, os, time, utils_misc, argparse
+from utils_dataset import SQLDataset
 from utils_logplot import LogPlot
-import torch, os, time, utils_hdf5, argparse
 import random

 parser = argparse.ArgumentParser()
@@ -96,7 +97,7 @@ def convert_example_to_features(tokens_a, tokens_b, max_seq_length, tokenizer):
     return input_ids, input_mask, segment_ids, lm_label_ids

 def collate_func(inps):
-    bodies = [inp[0].decode() for inp in inps]
+    bodies = [inp['body'] for inp in inps]
     bodies_tokenized = [tokenizer.tokenize(body) for body in bodies]

     max_length = 400
@@ -131,7 +132,7 @@ def collate_func(inps):

     return batch_ids, batch_mask, batch_segments, batch_lm_label_ids, batch_is_next

-dataset = utils_hdf5.HDF5Dataset(args.dataset_file, collection_name="name")
+dataset = SQLDataset(args.dataset_file)
 dataloader = DataLoader(dataset=dataset, batch_size=2*args.train_batch_size, sampler=RandomSampler(dataset), drop_last=True, collate_fn=collate_func)

 param_optimizer = list(model.named_parameters())
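The collate_func change above is what the notebook's markdown cell alludes to: each batch element is now a dict-like sqlite3.Row, so fields are read by name instead of decoding HDF5 byte strings. To plug in another data format, only the Dataset (and, if the item shape differs, this collate function) needs to change. A hedged illustration for newline-delimited JSON, with a hypothetical articles.jsonl file that mirrors the title/body schema, could be:

import json
from torch.utils.data import Dataset

class JSONLDataset(Dataset):
    # Illustrative only; not part of this commit. Each line of the file is one JSON document.
    def __init__(self, jsonl_file):
        with open(jsonl_file, "r", encoding="utf-8") as f:
            self.records = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        return self.records[idx]  # a plain dict with 'title' and 'body' keys

Because each item is a dict exposing 'title' and 'body', the modified collate functions in these scripts would work on it unchanged.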

pretrain_coverage.py

+18 -16
@@ -1,33 +1,37 @@
-from transformers.optimization import AdamW, WarmupLinearSchedule
-import torch.utils.data
+from transformers.optimization import AdamW
 from torch.utils.data import DataLoader, RandomSampler

-import tqdm, nltk, torch, time, numpy as np
-import argparse, os
+import tqdm, nltk, torch, time, numpy as np, argparse, os
 from utils_logplot import LogPlot
-from coverage import KeywordCoverage
-import utils_hdf5
+from model_coverage import KeywordCoverage
+from utils_dataset import SQLDataset
+import utils_misc

 parser = argparse.ArgumentParser()
 parser.add_argument("--experiment", type=str, required=True, help="Experiment name. Will be used to save a model file and a log file.")
+parser.add_argument("--dataset_file", type=str, required=True, help="Which dataset file to use. Can be full path or the root folder will be attached.")
+
 parser.add_argument("--train_batch_size", type=int, default=8, help="Training batch size.")
 parser.add_argument("--n_kws", type=int, default=15, help="Top n words (tf-idf wise) will be masked in the coverage model.")
 parser.add_argument("--device", type=str, default="cuda", help="cuda or cpu")
 parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")

+models_folder = "/home/ubuntu/models/"
+logs_folder = "/home/ubuntu/logs/"
+
+
 args = parser.parse_args()

 if args.device == "cuda":
-    freer_gpu = str(utils_hdf5.get_freer_gpu())
+    freer_gpu = str(utils_misc.get_freer_gpu())
     os.environ["CUDA_VISIBLE_DEVICES"] = ""+str(freer_gpu)
     args.experiment += "_"+freer_gpu

-def collate_func(inps):
-    return [inp[0].decode() for inp in inps], [inp[1].decode() for inp in inps]
+def collate_func(documents):
+    # When pretraining the coverage model, can feed real summaries, or the first K words of the document as summaries (for full unsupervised).
+    return [utils_misc.cut300(doc['body']) for doc in documents], [" ".join(doc['body'].split()[:50]) for doc in documents]

-models_folder = "/home/phillab/models/"
-# dataset = utils_hdf5.HDF5Dataset("/home/phillab/dataset/nl_quality_summaries.0.2.hdf5", collection_name="name")
-dataset = utils_hdf5.HDF5Dataset("/home/phillab/dataset/cnndm_training.hdf5", collection_name="name")
+dataset = SQLDataset(args.dataset_file)
 dataloader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, sampler=RandomSampler(dataset), drop_last=True, collate_fn=collate_func)

 kw_cov = KeywordCoverage(args.device, keyword_model_file=os.path.join(models_folder, "keyword_extractor.joblib"), n_kws=args.n_kws) # , model_file=os.path.join(models_folder, "news_bert_bs64.bin")
@@ -42,8 +46,7 @@ def collate_func(inps):
 ]

 optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
-scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=len(dataloader))
-logplot = LogPlot("/home/phillab/logs/coverage/bert_coverage_"+args.experiment+".log")
+logplot = LogPlot(os.path.join(logs_folder, "coverage/bert_coverage_%s.log" % (args.experiment)))

 if args.fp16:
     try:
@@ -65,12 +68,11 @@ def collate_func(inps):
         loss.backward()

         if ib%optim_every == 0:
-            scheduler.step() # Update learning rate schedule
             optimizer.step()
             optimizer.zero_grad()

         logplot.cache({"loss": loss.item(), "accuracy": acc, "count": len(batch)}, prefix="T_")
         if time.time()-time_save > 60.0:
             logplot.save(printing=True)
             time_save = time.time()
-            kw_cov.save_model("/home/phillab/models/bert_coverage_"+args.experiment+".bin")
+            kw_cov.save_model(os.path.join(models_folder, "bert_coverage_%s.bin" % (args.experiment)))
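The rewritten collate_func encodes the design choice stated in its comment: rather than requiring gold summaries, coverage pretraining can use the first 50 words of each article as a pseudo-summary, keeping the pipeline fully unsupervised. utils_misc.cut300 is not shown in this commit; presumably it caps a document at roughly its first 300 words, along the lines of this sketch (an assumption, not the actual helper):

def cut300(text):
    # Assumed behavior of utils_misc.cut300: keep roughly the first 300 whitespace-separated words.
    return " ".join(text.split()[:300])

def pseudo_summary(text, k=50):
    # The pseudo-summary used in collate_func above: the first k words of the document body.
    return " ".join(text.split()[:k])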

train_generator.py

+15 -31
@@ -1,23 +1,18 @@
 from transformers.optimization import AdamW
 from model_generator import GeneTransformer
 from torch.utils.data import DataLoader, RandomSampler
+import torch, os, time, argparse, tqdm
+from utils_dataset import SQLDataset
 from utils_logplot import LogPlot
-import torch, os, time, argparse
 from datetime import datetime
-import utils_hdf5
-import getpass, tqdm
-
-# user = os.getlogin()
-user = getpass.getuser()
+import utils_misc

 parser = argparse.ArgumentParser()
 parser.add_argument("--experiment", type=str, required=True, help="Experiment name. Will be used to save a model file and a log file.")
 parser.add_argument("--dataset_file", type=str, required=True, help="Which dataset file to use.")
-parser.add_argument("--task", type=str, help="Which generation task to perform. Can be: `cgen` (conditionally generate), lm` (language modeling) or `copy`")
+parser.add_argument("--task", type=str, required=True, help="Which generation task to perform. Can be: `cgen` (conditionally generate), lm` (language modeling) or `copy`. `cgen` is useful to train a supervised model, when data is available (for example a headline generator, summarizer, etc). `lm` is an unconditional language model, such as the GPT2 model, can be used to train a Fluency model. `copy` can be used to pretrain the generator for the summary_loop, this speeds up training of the summary_loop as the generator already starts with the strong baseline of copying the first K words of the input.")
 parser.add_argument("--max_output_length", required=True, type=int, help="Maximum output length. Saves time if the sequences are short.")

-parser.add_argument("--root_folder", type=str, default="/home/"+user+"/")
-parser.add_argument("--tokenizer", type=str, default="gpt2", help="Which tokenizer to use: gpt2 or bpecap.")
 parser.add_argument("--train_batch_size", type=int, default=8, help="Training batch size.")
 parser.add_argument("--n_epochs", type=int, default=3, help="Number of epochs to run over the data.")
 parser.add_argument("--optim_every", type=int, default=4, help="Optimize every x backprops. A multiplier to the true batch size.")
@@ -27,33 +22,27 @@

 args = parser.parse_args()

-models_folder = os.path.join(args.root_folder, "models/")
-logs_folder = os.path.join(args.root_folder, "logs/")
+models_folder = "/home/ubuntu/models/"
+logs_folder = "/home/ubuntu/logs/"

 if args.device == "cuda":
-    freer_gpu = str(utils_hdf5.get_freer_gpu())
+    freer_gpu = str(utils_misc.get_freer_gpu())
     os.environ["CUDA_VISIBLE_DEVICES"] = ""+str(freer_gpu)
     args.experiment += "_"+freer_gpu

 learning_rate = 2e-5
 n_epochs = args.n_epochs

-utils_hdf5.DoublePrint("printlog_generator_"+args.experiment+"_"+datetime.now().strftime("%Y-%m-%d")+".log", "a") ## << Wooh
-
-bpe_model = ""
-if args.tokenizer == "bpecap":
-    bpe_model = os.path.join(models_folder, "m.model")
-
-model = GeneTransformer(tokenizer_type=args.tokenizer, max_output_length=args.max_output_length, device=args.device, bpe_model=bpe_model)
+model = GeneTransformer(tokenizer_type="gpt2", max_output_length=args.max_output_length, device=args.device, bpe_model="")
 if len(args.starter_model) > 0:
     model.reload(os.path.join(models_folder, args.starter_model))

 print("Model loaded")

-def collate_func(inps):
-    return [inp[0] for inp in inps], [inp[1] for inp in inps]
+def collate_func(documents):
+    return [utils_misc.cut300(doc['body']) for doc in documents], [doc['title'] for doc in documents]

-dataset = utils_hdf5.HDF5Dataset(args.dataset_file, collection_name="name")
+dataset = SQLDataset(args.dataset_file)

 N = len(dataset)
 N_dev = 500
@@ -63,16 +52,14 @@ def collate_func(inps):
 dl_train = DataLoader(dataset=d_train, batch_size=args.train_batch_size, sampler=RandomSampler(d_train), collate_fn=collate_func)
 dl_dev = DataLoader(dataset=d_dev, batch_size=20, sampler=RandomSampler(d_dev), collate_fn=collate_func)

-# dataloader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, sampler=RandomSampler(dataset), drop_last=True, collate_fn=collate_func)
-
 param_optimizer = list(model.model.named_parameters())
 no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
 optimizer_grouped_parameters = [
     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
 ]

-logplot_file = os.path.join(logs_folder, "generator_"+args.experiment+".log")
+logplot_file = os.path.join(logs_folder, "generator_%s.log" % (args.experiment))
 summ = LogPlot(logplot_file)

 optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
@@ -89,14 +76,11 @@ def collate_func(inps):

 def map_batch(batch, task):
     sources, targets = batch
-    sources = [source.decode() for source in sources]
-    targets = [target.decode() for target in targets]

-    sources = [s for s in sources]
-    if task == "copy":
+    if task == "cgen":
+        pass # already in shape
+    elif task == "copy":
         targets = sources
-    elif task == "cgen":
-        targets = [t for t in targets]
     elif task == "lm":
         targets = sources
         sources = [""] * len(sources)

train_summary_loop.py

+14 -14
@@ -1,15 +1,15 @@
+from torch.utils.data import DataLoader, RandomSampler
+import torch, os, sys, time, argparse, numpy as np
+from utils_dataset import SQLDataset, HDF5Dataset
 from transformers.optimization import AdamW
 from model_generator import GeneTransformer
-from torch.utils.data import DataLoader, RandomSampler
 from datetime import datetime, timedelta
 from utils_logplot import LogPlot
-import torch, os, sys, time, argparse, numpy as np
-import utils_hdf5, utils_tokenizer
+import utils_misc, utils_tokenizer

-from coverage import KeywordCoverage
-from fluency import PatternPenalty, LengthPenalty, RepeatPenalty
+from model_coverage import KeywordCoverage
+from model_guardrails import PatternPenalty, LengthPenalty, RepeatPenalty
 import threading, queue
-import torch.utils.data.dataset

 user = os.getlogin()

@@ -24,20 +24,20 @@
 parser.add_argument("--max_output_length", type=int, default=25, help="Maximum output length. Saves time if the sequences are short.")
 parser.add_argument("--save_every", type=int, default=60, help="Number of seconds between any two saves.")
 parser.add_argument("--device", type=str, default="cuda", help="cuda or cpu")
-parser.add_argument("--log_folder", type=str, default="", help="What should the model file start with.")
 parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
 parser.add_argument("--ckpt_every", type=int, default=600, help="If 0, checkpointing is not used. Otherwise, checkpointing is done very x seconds.")
 parser.add_argument("--ckpt_lookback", type=int, default=300, help="When checkpointing, will consider the avg total score of the last x samples.")

 args = parser.parse_args()
 if args.device == "cuda":
-    freer_gpu = str(utils_hdf5.get_freer_gpu())
+    freer_gpu = str(utils_misc.get_freer_gpu())
     os.environ["CUDA_VISIBLE_DEVICES"] = ""+str(freer_gpu)
     args.experiment += "_"+freer_gpu

-models_folder = os.path.join(args.root_folder, "models/")
+models_folder = "/home/ubuntu/models/"
+log_folder = "/home/ubuntu/logs/"
+
 summarizer_model_start = os.path.join(models_folder, "gpt2_copier23.bin")
-args.log_folder = os.path.join(args.root_folder, "logs/", args.log_folder)

 ckpt_every = args.ckpt_every
 ckpt_lookback = int((args.ckpt_lookback+args.train_batch_size-1)/args.train_batch_size)
@@ -72,7 +72,7 @@ def collate_func(inps):
     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
 ]

-logplot_file = os.path.join(args.log_folder, "summary_loop_"+args.experiment+".log")
+logplot_file = os.path.join(log_folder, "summary_loop_%s.log" % (args.experiment))
 logplot = LogPlot(logplot_file)

 optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
@@ -106,9 +106,9 @@ def background_tokenizer(bodies, out_queue):
 print("Started training")

 if ".db" in args.dataset_file:
-    all_dataset = utils_hdf5.SQLDataset(args.dataset_file)
+    all_dataset = SQLDataset(args.dataset_file)
 else:
-    all_dataset = utils_hdf5.HDF5Dataset(args.dataset_file, collection_name="name")
+    all_dataset = HDF5Dataset(args.dataset_file, collection_name="name")

 dataset = all_dataset

@@ -234,7 +234,7 @@ def background_tokenizer(bodies, out_queue):
         print("==============================================================================")

         if best_ckpt_score is None or current_score > best_ckpt_score:
-            print("[CKPT] Saved new best at:", current_score, "["+str(datetime.now())+"]")
+            print("[CKPT] Saved new best at: %.3f %s" % (current_score, "["+str(datetime.now())+"]"))
             best_ckpt_score = current_score
             torch.save(summarizer.model.state_dict(), ckpt_file)
             torch.save(optimizer.state_dict(), ckpt_optimizer_file)
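Two changes above are worth spelling out: the dataset class is now chosen from the file name (.db files go through SQLDataset, anything else falls back to the legacy HDF5Dataset), and --ckpt_lookback, which is specified in samples, is converted into a number of recent batches by ceiling division before the checkpoint score is averaged. A brief sketch of both, with illustrative values where the defaults are not visible in this diff:

from utils_dataset import SQLDataset, HDF5Dataset

def load_dataset(dataset_file):
    # Mirrors the dispatch in train_summary_loop.py: SQLite (e.g. wikinews.db) vs. legacy HDF5.
    if ".db" in dataset_file:
        return SQLDataset(dataset_file)
    return HDF5Dataset(dataset_file, collection_name="name")

ckpt_lookback_samples = 300   # --ckpt_lookback default shown in the diff
train_batch_size = 8          # illustrative; the --train_batch_size default is not visible here
ckpt_lookback_batches = int((ckpt_lookback_samples + train_batch_size - 1) / train_batch_size)  # ceil(300/8) = 38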
