-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCreateNaiveBayesModel.py
133 lines (114 loc) · 6.86 KB
/
CreateNaiveBayesModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!\Python35\python.exe
import json
import CreateVocabulary
import re
from collections import Counter
import joblib
import os
import numpy
import PreProcessing as pp
from time import gmtime, strftime
class NaiveBayesModel(object):
    """Build the likelihood table of a unigram + bigram Naive Bayes model.

    ``generateProbabilityTable`` reads the training data JSON
    (``TrainDataPath``) and the vocabulary JSON written by
    ``CreateVocabulary.GenerateVocabularyData``, counts how often every
    vocabulary entry occurs in the positive and in the negative training
    texts, and turns the counts into add-one smoothed probabilities.
    """

    # Class-level defaults. __init__ overrides every value that depends on the
    # stemming flag and gives each instance its own LikelihoodDict.
    stemming = False
    TrainDataPath = os.path.join('data', 'train data without stop words.json')
    file_name_suffix = 'WithoutStemming'
    model_name = ''
    ModelPath = ''
    VocabularyJsonDataPath = ''
    LikelihoodDebugJsonTable = ''
    LikelihoodDict = {}
    PositiveClassProbability = 0
    NegativeClassProbability = 0

    def __init__(self, stemming):
        """Configure output names and paths for the chosen variant.

        Args:
            stemming: truthy to build the model "WithStemming", falsy for
                "WithoutStemming". The value is also forwarded unchanged to
                the preprocessing and vocabulary helpers.
        """
        self.stemming = stemming
        if stemming:
            self.file_name_suffix = 'WithStemming'
        # A fresh dict per instance: the class-level dict would otherwise be
        # shared (and overwritten) by every NaiveBayesModel object.
        self.LikelihoodDict = {}
        output_dir = os.path.join('output', self.file_name_suffix)
        self.model_name = 'NaiveBayesModel' + self.file_name_suffix + '.pickle'
        self.ModelPath = os.path.join(output_dir, self.model_name)
        self.VocabularyJsonDataPath = os.path.join(output_dir, 'vocabulary.json')
        self.LikelihoodDebugJsonTable = os.path.join(output_dir, 'likelihood_debug_json.json')

    def _buildDocument(self, train_data, stemming):
        """Concatenate the training texts and run them through preprocessing.

        Every text is prefixed with a single space (so the raw document starts
        with a space) before being handed to
        ``pp.StopWordAndSpecialCharRemoval``.
        NOTE(review): presumably that helper strips stop words and special
        characters and stems when ``stemming`` is set - confirm in
        PreProcessing.
        """
        document = ''.join(' ' + td for td in train_data)
        return pp.StopWordAndSpecialCharRemoval(document, stemming)

    def generateWordCount(self, train_data, stemming):
        """Return a dict mapping each distinct token to its occurrence count.

        Args:
            train_data: iterable of training text strings for one class.
            stemming: flag forwarded to the preprocessing helper.

        Returns:
            dict of token -> count, built from ``numpy.unique`` (keys and
            values are therefore numpy scalars).
        """
        document = self._buildDocument(train_data, stemming)
        word_array = numpy.array(document.split())
        unique, counts = numpy.unique(word_array, return_counts=True)
        return dict(zip(unique, counts))

    def generateBigramCount(self, vacabulary_list, train_data, stemming):
        """Count how often each vocabulary bigram occurs in the training texts.

        Args:
            vacabulary_list: iterable of two-item sequences (word pairs).
            train_data: iterable of training text strings for one class.
            stemming: flag forwarded to the preprocessing helper.

        Returns:
            dict keyed by ``"first,second"`` with the number of times that
            adjacent pair appears in the preprocessed document (0 if never).
        """
        document = self._buildDocument(train_data, stemming)
        tokens = re.split(r'\s+', document)
        # Tally all adjacent pairs once instead of scanning the pair list for
        # every vocabulary entry.
        observed = Counter(zip(tokens, tokens[1:]))
        bigram_and_count = {}
        for word in vacabulary_list:
            # Counter returns 0 for pairs that never occurred.
            bigram_and_count[word[0] + ',' + word[1]] = observed[tuple(word)]
        return bigram_and_count

    def calculateProbability(self, WordFrequency, TotalWordCount, VocabularySize):
        """Return the add-one smoothed likelihood of a term.

        Computes ``(WordFrequency + 1) / (TotalWordCount + VocabularySize)``.
        """
        return float(WordFrequency + 1) / float(TotalWordCount + VocabularySize)

    def log(self, msg):
        """Print ``msg`` prefixed with the current UTC time (``gmtime``)."""
        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + msg)

    def generateProbabilityTable(self):
        """Generate, store and return the likelihood table.

        Steps: (re)generate the vocabulary file, load vocabulary and training
        data, count unigrams and bigrams per class, convert the counts into
        smoothed probabilities and class priors, and dump the result as JSON
        to ``LikelihoodDebugJsonTable`` for debugging.

        Returns:
            ``self.LikelihoodDict`` with the keys ``unigram``, ``bigram``,
            ``PositiveClassProbability``, ``NegativeClassProbability``,
            ``TotalPositiveWordCount``, ``TotalNegativeWordCount`` and
            ``VocabularySize``.
        """
        self.log(' : Generating vocuabulary data(both unigram and bigram)...')
        TotalWordCounts = CreateVocabulary.GenerateVocabularyData(
            self.stemming, self.VocabularyJsonDataPath, self.TrainDataPath)
        self.log(' : Vocabulary geneartion successful!')
        self.log(' : Generating frequency and likelihood table...')

        with open(self.VocabularyJsonDataPath) as VocabularyJSONData:
            VocabularyData = json.load(VocabularyJSONData)
        with open(self.TrainDataPath) as TrainJSONData:
            TrainingData = json.load(TrainJSONData)

        positive_total = TotalWordCounts['PositiveWordCount']
        negative_total = TotalWordCounts['NegativeWordCount']
        # Unigrams and bigrams share one smoothing denominator.
        VocabularySize = len(VocabularyData['unigram']) + len(VocabularyData['bigram'])

        positive_unigrams_and_its_count = self.generateWordCount(TrainingData['positive'], self.stemming)
        negative_unigrams_and_its_count = self.generateWordCount(TrainingData['negative'], self.stemming)
        self.log(' : Frequency table generation for unigram is finished!')

        positive_bigrams_and_its_count = self.generateBigramCount(
            VocabularyData['bigram'], TrainingData['positive'], self.stemming)
        negative_bigrams_and_its_count = self.generateBigramCount(
            VocabularyData['bigram'], TrainingData['negative'], self.stemming)
        self.log(' : Frequency table generation for bigram is finished!')

        UnigramLikelihoodDict = {}
        for Word in VocabularyData['unigram']:
            # Counts are looked up by the lower-cased word; a word that never
            # occurs in a class has frequency 0.
            lowered = Word.lower()
            PositiveFrequencyCount = positive_unigrams_and_its_count.get(lowered)
            if PositiveFrequencyCount is None:
                PositiveFrequencyCount = 0
            NegativeFrequencyCount = negative_unigrams_and_its_count.get(lowered)
            if NegativeFrequencyCount is None:
                NegativeFrequencyCount = 0
            UnigramLikelihoodDict[Word] = [
                self.calculateProbability(PositiveFrequencyCount, positive_total, VocabularySize),
                self.calculateProbability(NegativeFrequencyCount, negative_total, VocabularySize),
            ]
        self.log(' : Likelihood table generation for unigram is finished!')

        BigramLikelihoodDict = {}
        for Word in VocabularyData['bigram']:
            bigram_key = Word[0] + ',' + Word[1]
            # Probability of the bigram in each class for the likelihood table.
            BigramLikelihoodDict[bigram_key] = [
                self.calculateProbability(positive_bigrams_and_its_count[bigram_key], positive_total, VocabularySize),
                self.calculateProbability(negative_bigrams_and_its_count[bigram_key], negative_total, VocabularySize),
            ]
        self.log(' : Likelihood table generation for bigram is finished!')

        self.LikelihoodDict['unigram'] = UnigramLikelihoodDict
        self.LikelihoodDict['bigram'] = BigramLikelihoodDict

        # Class priors are the share of each class in the total word count.
        combined_total = float(positive_total + negative_total)
        self.PositiveClassProbability = float(positive_total) / combined_total
        self.NegativeClassProbability = float(negative_total) / combined_total
        self.LikelihoodDict['PositiveClassProbability'] = self.PositiveClassProbability
        self.LikelihoodDict['NegativeClassProbability'] = self.NegativeClassProbability
        self.LikelihoodDict['TotalPositiveWordCount'] = positive_total
        self.LikelihoodDict['TotalNegativeWordCount'] = negative_total
        self.LikelihoodDict['VocabularySize'] = VocabularySize

        # DEBUG: human-readable copy of the table.
        with open(self.LikelihoodDebugJsonTable, 'w') as outfile:
            json.dump(self.LikelihoodDict, outfile, sort_keys=True, indent=4)
        self.log(' : Table creation successful!')
        return self.LikelihoodDict
if __name__ == '__main__':
    # Ask which model variant to build: the number 1 selects stemming, any
    # other integer selects the variant without stemming.
    print('Please enter 1 for model with stemming and 2 for model without stemming:')
    use_stemming = int(input()) == 1

    NBM = NaiveBayesModel(use_stemming)
    print(NBM.ModelPath)

    # Build the likelihood table and persist it with maximum compression.
    LikelihoodDict = NBM.generateProbabilityTable()
    joblib.dump(LikelihoodDict, NBM.ModelPath, compress=9)