PreProcessing.py

import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()
StopWords = ['a','about','above','after','again','against','all','am','an','and','any','are','aren\'t','as','at','be','because','been','before','being','below','between','both','but','by','can\'t','cannot','could','couldn\'t','did','didn\'t','do','does','doesn\'t','doing','don\'t','down','during','each','few','for','from','further','had','hadn\'t','has','hasn\'t','have','haven\'t','having','he','he\'d','he\'ll','he\'s','her','here','here\'s','hers','herself','him','himself','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','if','in','into','is','isn\'t','it','it\'s','its','itself','let\'s','me','more','most','mustn\'t','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours', 'ourselves','out','over','own','same','shan\'t','she','she\'d','she\'ll','she\'s','should','shouldn\'t','so','some','such','than','that','that\'s','the','their','theirs','them','themselves','then','there','there\'s','these','they','they\'d','they\'ll','they\'re','they\'ve','this','those','through','to','too','under','until','up','very','was','wasn\'t','we','we\'d','we\'ll','we\'re','we\'ve','were','weren\'t','what','what\'s','when','when\'s','where','where\'s','which','while','who','who\'s','whom','why','why\'s','with','won\'t','would','wouldn\'t','you','you\'d','you\'ll','you\'re','you\'ve','your','yours','yourself','yourselves']

def perform_stemming(tds):
    stemmed_td = []
    for td in tds:
        stemmed_td.append(ps.stem(td))
    return stemmed_td

def StopWordAndSpecialCharRemoval(TestOrTrainData, stemming):
    StopWordsRemovedTestData = ''
    for word in TestOrTrainData.lower().split():
        if word not in StopWords:
            if stemming == True:
                StopWordsRemovedTestData = StopWordsRemovedTestData + ps.stem(word) + ' '
            else:
                StopWordsRemovedTestData = StopWordsRemovedTestData + word + ' '
    StopWordsRemovedTestData = ps.stem(StopWordsRemovedTestData[:-1])
    StopWordsRemovedTestData = re.sub('[^A-Za-z0-9 ]+', '', StopWordsRemovedTestData)
    return StopWordsRemovedTestData