-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprossing.py
72 lines (58 loc) · 1.94 KB
/
preprossing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# import libraries
import nltk
from nltk.stem import PorterStemmer
import re
import csv
class Preprossing:
    """Namespace of text-preprocessing steps for a document collection.

    Every method is a static method and is called on the class itself,
    e.g. ``Preprossing.get_documents()``. The transformation steps
    (``remove_Punctuation``, ``remove_stopwords``, ``stemming``) rewrite the
    values of the dictionary they are given *in place* and also return that
    same dictionary.
    """

    @staticmethod
    def get_stopwords(path='data/stopwords.txt'):
        """Return the stopwords stored one per line in *path*.

        Args:
            path: Text file holding one stopword per line.

        Returns:
            A list of the non-empty lines, in file order, without their
            trailing newline.
        """
        # Read every line instead of a fixed count, and strip only the
        # newline so a final line without one keeps its last character.
        with open(path, 'r') as stopword_file:
            words = [line.rstrip('\n') for line in stopword_file]
        return [word for word in words if word]

    @staticmethod
    def get_documents(path='data/trec.sample.txt'):
        """Parse *path* into a ``{document id: raw text}`` dictionary.

        Every ``': <rest of line>'`` match in the file is collected and the
        matches are consumed in groups of three: the first run of digits in
        the first match is the document id, and the second and third matches
        are concatenated (each still starting with ``': '``) to form the
        document text.

        Args:
            path: File to parse.

        Returns:
            A dict mapping integer ids to text. A repeated id keeps the
            last text seen.

        Raises:
            IndexError: If the number of matches is not a multiple of three.
            AttributeError: If an id match contains no digits.
        """
        with open(path, 'r') as sample_file:
            fields = re.findall(r': .+', sample_file.read())
        documents = {}
        for start in range(0, len(fields), 3):
            doc_id = int(re.search(r'\d+', fields[start]).group(0))
            documents[doc_id] = fields[start + 1] + fields[start + 2]
        return documents

    @staticmethod
    def remove_Punctuation(documents):
        """Lower-case every document and blank out non-word characters.

        Each character matching ``\\W`` is replaced by a single space, so
        letters, digits and underscores are kept.

        Args:
            documents: Dict of id -> text; its values are replaced in place.

        Returns:
            The same *documents* dictionary.
        """
        for doc_id, text in documents.items():
            documents[doc_id] = re.sub(r'\W', " ", text.lower())
        return documents

    @staticmethod
    def remove_stopwords(documents, stopwords=None):
        """Drop every token that appears in *stopwords* from each document.

        Documents are tokenized with ``nltk.word_tokenize``; each surviving
        token is written back preceded by one space, so a non-empty result
        starts with a space.

        Args:
            documents: Dict of id -> text; its values are replaced in place.
            stopwords: Iterable of tokens to remove. When omitted, the list
                returned by ``Preprossing.get_stopwords()`` is used.

        Returns:
            The same *documents* dictionary.
        """
        if stopwords is None:
            stopwords = Preprossing.get_stopwords()
        # A set gives constant-time membership tests for every token.
        excluded = set(stopwords)
        for doc_id, text in documents.items():
            kept = (
                token for token in nltk.word_tokenize(text)
                if token not in excluded
            )
            documents[doc_id] = ''.join(' ' + token for token in kept)
        return documents

    @staticmethod
    def stemming(documents):
        """Replace every token of each document by its Porter stem.

        Documents are tokenized with ``nltk.word_tokenize``; each stem is
        written back preceded by one space, so a non-empty result starts
        with a space.

        Args:
            documents: Dict of id -> text; its values are replaced in place.

        Returns:
            The same *documents* dictionary.
        """
        # One stemmer is shared by all tokens instead of one per token.
        stemmer = PorterStemmer()
        for doc_id, text in documents.items():
            documents[doc_id] = ''.join(
                ' ' + stemmer.stem(token)
                for token in nltk.word_tokenize(text)
            )
        return documents
# Run the preprocessing pipeline: load the stopwords and the documents, then
# apply punctuation removal, stopword removal and stemming, in that order.
stopwords = Preprossing.get_stopwords()
documents = Preprossing.get_documents()
documents = Preprossing.remove_Punctuation(documents)
documents = Preprossing.remove_stopwords(documents)
documents = Preprossing.stemming(documents)

# Write one (id, document) row per document under a header row.
# newline='' lets the csv module control line endings itself; without it
# an extra blank line appears after every row on Windows. The with-block
# closes the file, so no explicit close() is needed.
with open('preprossing.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(('id', 'document'))
    writer.writerows(documents.items())