scan_dataset.py
import csv
import difflib
import json

import numpy as np

# Concatenate every review in the IMDB dataset into one long string so that
# generated samples can be matched against it as a single haystack.
with open("IMDB Dataset.csv", mode='r') as f:
    # dataset = list(csv.reader(f, delimiter=","))
    dataset = ""
    csvreader = csv.reader(f)
    next(csvreader, None)  # skip the "review,sentiment" header row (Kaggle CSV layout)
    for row in csvreader:
        dataset += row[0]
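# Optional sanity check (not in the original script): the concatenated IMDB
# text should run to tens of millions of characters.
# print(f"dataset length: {len(dataset):,} characters")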
def parse_commoncrawl(wet_file):
    """
    Quick and ugly parsing of a WET file.
    Tested for the May 2021 crawl.
    """
    with open(wet_file) as f:
        lines = f.readlines()
    # Each WET record begins with a "WARC/1.0" header line.
    start_idxs = [i for i in range(len(lines)) if "WARC/1.0" in lines[i]]
    all_eng = ""
    count_eng = 0
    for i in range(len(start_idxs) - 1):
        start = start_idxs[i]
        end = start_idxs[i + 1]
        # Fixed offsets into the record header: line start+7 is assumed to
        # carry the language tag and the body to begin at start+10, matching
        # the May 2021 crawl layout noted in the docstring.
        if "WARC-Identified-Content-Language: eng" in lines[start + 7]:
            count_eng += 1
            for j in range(start + 10, end):
                all_eng += lines[j]
    return all_eng
# dataset = parse_commoncrawl("commoncrawl.warc.wet")
def select_samples(ls, n=100):
    # Draw n distinct samples; np.random.choice converts the list to an array.
    return np.random.choice(ls, n, replace=False)
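# For reproducible draws across runs one could seed NumPy first, e.g.
# np.random.seed(0); the seed value is an arbitrary assumption, not
# something the original script sets.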
with open("gpt-2-imdb2.txt", mode = 'r', encoding="utf-8") as s:
ls = json.load(s)
samples = select_samples(ls)
# print(ls)
with open("gpt-2-xl.txt", 'r', encoding="utf-8") as s:
ls = json.load(s)
with open("llama-samples-perp.txt", encoding="utf-8") as s:
ls = json.load(s)
# print(ls)
# Naive substring match
# for sample in samples:
#     if len(sample) <= 2:
#         continue
#     # for str in dataset:
#     #     if sample in str:
#     #         print((sample, str))
#     if sample in dataset:
#         print("Found")
# difflib: longest common substring between each sample and the dataset.
for sample in samples:
    if len(sample) <= 2:
        continue
    # print(difflib.get_close_matches(sample, dataset, n=3))
    # Explicit bounds keep this compatible with Python < 3.9, where
    # find_longest_match has no default arguments.
    match = difflib.SequenceMatcher(None, sample, dataset).find_longest_match(
        0, len(sample), 0, len(dataset))
    print("====================")
    print(sample[match.a:match.a + match.size])
    print(dataset[match.b:match.b + match.size])
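
# A possible extension (a sketch, not part of the original script): flag any
# sample whose longest common substring with the dataset exceeds a length
# threshold, as a rough signal of verbatim memorization. The function name and
# the 50-character cutoff are assumptions, not values from the original.
def flag_memorized(samples, dataset, min_len=50):
    flagged = []
    for sample in samples:
        if len(sample) <= 2:
            continue
        m = difflib.SequenceMatcher(None, sample, dataset).find_longest_match(
            0, len(sample), 0, len(dataset))
        if m.size >= min_len:
            flagged.append((sample, sample[m.a:m.a + m.size]))
    return flagged

# Example usage of the sketch above:
# for sample, overlap in flag_memorized(samples, dataset):
#     print(f"{len(overlap)}-char overlap: {overlap[:80]!r}")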