-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPreProcessing.py
23 lines (20 loc) · 2.11 KB
/
PreProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import re
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used by the functions in this module.
ps = PorterStemmer()
# Lower-case English stop words, including contracted forms such as "aren't".
# StopWordAndSpecialCharRemoval drops any whitespace-separated token that
# matches an entry exactly.
StopWords = ['a','about','above','after','again','against','all','am','an','and','any','are','aren\'t','as','at','be','because','been','before','being','below','between','both','but','by','can\'t','cannot','could','couldn\'t','did','didn\'t','do','does','doesn\'t','doing','don\'t','down','during','each','few','for','from','further','had','hadn\'t','has','hasn\'t','have','haven\'t','having','he','he\'d','he\'ll','he\'s','her','here','here\'s','hers','herself','him','himself','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','if','in','into','is','isn\'t','it','it\'s','its','itself','let\'s','me','more','most','mustn\'t','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours', 'ourselves','out','over','own','same','shan\'t','she','she\'d','she\'ll','she\'s','should','shouldn\'t','so','some','such','than','that','that\'s','the','their','theirs','them','themselves','then','there','there\'s','these','they','they\'d','they\'ll','they\'re','they\'ve','this','those','through','to','too','under','until','up','very','was','wasn\'t','we','we\'d','we\'ll','we\'re','we\'ve','were','weren\'t','what','what\'s','when','when\'s','where','where\'s','which','while','who','who\'s','whom','why','why\'s','with','won\'t','would','wouldn\'t','you','you\'d','you\'ll','you\'re','you\'ve','your','yours','yourself','yourselves']
def perform_stemming(tds):
    """Return a new list holding the stem of every item in ``tds``.

    Each item is passed, in order, to the module-level stemmer ``ps``;
    ``tds`` itself is left unmodified.
    """
    return [ps.stem(token) for token in tds]
def StopWordAndSpecialCharRemoval(TestOrTrainData, stemming):
    """Lower-case text, drop stop words, optionally stem, and strip symbols.

    Args:
        TestOrTrainData: Raw text to clean. It is lower-cased and split on
            whitespace into tokens.
        stemming: When truthy, every kept token is replaced by its stem from
            the module-level stemmer ``ps``; when falsy, tokens are kept
            unchanged.

    Returns:
        The kept tokens joined by single spaces, with every character other
        than ASCII letters, digits and spaces removed. An input with no kept
        tokens yields an empty string.
    """
    # Build the lookup once per call so each membership test is a hash
    # lookup rather than a scan of the StopWords list.
    stop_words = set(StopWords)
    tokens = [
        token
        for token in TestOrTrainData.lower().split()
        if token not in stop_words
    ]

    # Stem token by token only. Stemming the joined string would treat the
    # whole text as a single word and rewrite the tail of the last token,
    # even when stemming is disabled.
    if stemming:
        tokens = [ps.stem(token) for token in tokens]

    # NOTE(review): punctuation is stripped after the stop-word check, so a
    # token such as "the," is not recognised as a stop word and survives as
    # "the". Kept as-is to avoid changing tokenisation; confirm intent.
    return re.sub(r'[^A-Za-z0-9 ]+', '', ' '.join(tokens))