-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_preprocess.py
44 lines (36 loc) · 1.36 KB
/
dataset_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Loop through a directory, extract the text of each file, and make a list of the text.
"""
from fileinput import filename
import os
import re
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def preprocess(data_dir, stop_words=None):
    """Collect the cleaned words of every file found one level below ``data_dir``.

    ``data_dir`` is expected to contain sub-directories, each holding text
    files. Every line of every file has its punctuation removed (anything
    matching ``[^\\w\\s]``), is lower-cased and is split on whitespace. Tokens
    that are stop words or only one character long are dropped; the rest
    are appended to a single flat list.

    Args:
        data_dir: Path of the directory whose sub-directories hold the files.
        stop_words: Optional iterable of words to discard. When ``None``
            the NLTK English stop-word list is used.

    Returns:
        A list of lower-case word strings. Sub-directories and files are
        visited in sorted name order, so the result is deterministic.
    """
    if stop_words is None:
        # Load the NLTK list once instead of re-reading it for every line.
        stop_words = stopwords.words('english')

    punctuation = re.compile(r'[^\w\s]')
    # Stop words get the same normalisation as the text; otherwise entries
    # such as "don't" could never match the punctuation-free tokens.
    stop_set = {punctuation.sub('', word.lower()) for word in stop_words}

    word_list = []
    for directory in sorted(os.listdir(data_dir)):
        directory_path = os.path.join(data_dir, directory)
        if not os.path.isdir(directory_path):
            # A stray file next to the sub-directories must not abort the run.
            continue
        for file in sorted(os.listdir(directory_path)):
            filename = os.path.join(directory_path, file)
            if not os.path.isfile(filename):
                continue
            # Explicit encoding keeps the result independent of the locale;
            # undecodable bytes are replaced instead of raising.
            with open(filename, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    # split() also discards the newline and repeated blanks.
                    tokens = punctuation.sub('', line).lower().split()
                    word_list.extend(
                        token for token in tokens
                        if len(token) > 1 and token not in stop_set
                    )
    return word_list
if __name__ == "__main__":
    # Script entry point: run the preprocessing over the bundled dataset
    # directory. The result is only bound to a name, not printed or saved.
    dataset_dir = "datasets/20news-bydate-test"
    word_list = preprocess(dataset_dir)