GenerateSubsetOfDataset.py
"""Generate a balanced 1,000-review subset (500 positive / 500 negative) of a
fastText-format dataset and save it as JSON, with punctuation and stop words
removed."""
import re
import json
import os

# Each line of the dataset starts with "__label__1" (negative) or
# "__label__2" (positive), followed by the review text.
dataset_path = os.path.join('data', 'train.ft.txt')
def removeStopWords(ds):
    """Strip punctuation from a review and drop common English stop words."""
    stop_words = set(['a','about','above','after','again','against','all','am','an','and','any','are','aren\'t','as','at','be','because','been','before','being','below','between','both','but','by','can\'t','cannot','could','couldn\'t','did','didn\'t','do','does','doesn\'t','doing','don\'t','down','during','each','few','for','from','further','had','hadn\'t','has','hasn\'t','have','haven\'t','having','he','he\'d','he\'ll','he\'s','her','here','here\'s','hers','herself','him','himself','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','if','in','into','is','isn\'t','it','it\'s','its','itself','let\'s','me','more','most','mustn\'t','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours', 'ourselves','out','over','own','same','shan\'t','she','she\'d','she\'ll','she\'s','should','shouldn\'t','so','some','such','than','that','that\'s','the','their','theirs','them','themselves','then','there','there\'s','these','they','they\'d','they\'ll','they\'re','they\'ve','this','those','through','to','too','under','until','up','very','was','wasn\'t','we','we\'d','we\'ll','we\'re','we\'ve','were','weren\'t','what','what\'s','when','when\'s','where','where\'s','which','while','who','who\'s','whom','why','why\'s','with','won\'t','would','wouldn\'t','you','you\'d','you\'ll','you\'re','you\'ve','your','yours','yourself','yourselves'])
    # Remove common punctuation characters before tokenizing.
    ds = re.sub(r'[\&\!\:\\\-\#\$\.\"\,\^\_\'\(\)]+', '', ds)
    # Keep only words that are not stop words (comparison is case-sensitive).
    ds_without_stop_words = [word for word in ds.split() if word not in stop_words]
    return ' '.join(ds_without_stop_words)
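
# A quick illustrative check (assumed example, not from the dataset). Because
# the comparison above is case-sensitive, capitalized words survive even when
# their lowercase forms are stop words:
#   removeStopWords("This is a great CD!")  ->  "This great CD"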
def readBigFile(dataset_type, file_name, start_index):
    """Scan the dataset from start_index and write a balanced subset to file_name."""
    # dataset_type is currently unused; it is kept to match the call below.
    ds_count = 0
    datasets = {"positive": [], "negative": []}
    with open(dataset_path, encoding="utf8") as f:
        for line in f:
            if ds_count >= start_index:
                if (len(datasets['positive']) + len(datasets['negative'])) < 1000:
                    # line[11:] drops the "__label__X " prefix (10 chars plus a space).
                    if ("__label__2" in line) and (len(datasets['positive']) < 500):
                        datasets["positive"].append(removeStopWords(line[11:]))
                    elif ("__label__1" in line) and (len(datasets['negative']) < 500):
                        datasets["negative"].append(removeStopWords(line[11:]))
                else:
                    # 1,000 samples collected: report counts and write the subset.
                    print(len(datasets['positive']))
                    print(len(datasets['negative']))
                    print(ds_count)
                    with open(file_name, 'w') as outfile:
                        json.dump(datasets, outfile)
                    return
            ds_count += 1
    # NOTE: if the file ends before 1,000 samples are collected, no JSON is written.
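
# Illustrative input lines (assumed format, not actual dataset content):
#   __label__2 Great headphones: crisp sound and a comfortable fit...
#   __label__1 Stopped working after a week: very disappointed...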
if __name__ == "__main__":
    choice = input('Enter 1 to generate a train-data subset, or 2 for a test-data subset: ')
    if choice.strip() == '1':
        file_name = os.path.join('data', 'train data without stop words.json')
    else:
        file_name = os.path.join('data', 'test data without stop words.json')
    readBigFile(choice, file_name, 600)
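
# Downstream usage sketch (assumed, not part of this script): load the saved
# subset for training or evaluation.
#   with open(os.path.join('data', 'train data without stop words.json')) as f:
#       subset = json.load(f)
#   positives, negatives = subset['positive'], subset['negative']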