# P5_naive_bayes_classifier.py
from collections import defaultdict
import numpy as np
# Labelled training corpus: each entry is (comma-separated words, class label).
documents = [
    ("fun, couple, love, love", "comedy"),
    ("fast, furious, shoot", "action"),
    ("couple, fly, fast, fun, fun", "comedy"),
    ("furious, shoot, shoot, fun", "action"),
    ("fly, fast, shoot, love", "action"),
]

# Unlabelled document whose class the script predicts.
new_doc = "fast, couple, shoot, fly"
def preprocess(doc):
    """Lower-case a comma-separated document and split it into word tokens.

    Whitespace around each token is ignored and empty tokens (from an empty
    document, a trailing comma, or doubled commas) are dropped, so ``"a,b"``,
    ``"a, b"`` and ``"a ,  b,"`` all tokenize to ``["a", "b"]``.

    Args:
        doc: Comma-separated words as a single string.

    Returns:
        List of lower-cased, stripped tokens in their original order.
    """
    # Split on the bare comma and strip afterwards: splitting on ", " would
    # miss separators written without (or with extra) whitespace.
    stripped = (token.strip() for token in doc.lower().split(","))
    return [token for token in stripped if token]
def likelihood(word, label):
    """Return the add-one (Laplace) smoothed estimate of P(word | label).

    Reads the module-level ``word_counts``, ``total_words`` and ``vocab_size``
    tables, computing ``(count + 1) / (total_words[label] + vocab_size)``.
    The ``+ 1`` gives a word never seen with ``label`` a small non-zero
    probability instead of zero.

    Args:
        word: Token whose conditional probability is wanted.
        label: Class label to condition on; must be a key of ``word_counts``.

    Returns:
        The smoothed probability as a float.
    """
    # Use .get() rather than indexing: the per-class tables are defaultdicts,
    # and ``table[word]`` would insert a zero-count entry for every unseen
    # word that is merely queried, mutating the trained counts on read.
    count = word_counts[label].get(word, 0)
    return (count + 1) / (total_words[label] + vocab_size)
def compute_posterior(new_doc, label):
    """Score ``new_doc`` against ``label`` in log space.

    The score is the log of the class prior plus the log-likelihood of every
    token in the document, i.e. an unnormalised log-posterior. Working with
    logs turns the product of probabilities into a sum.

    Args:
        new_doc: Raw document string; tokenised with ``preprocess``.
        label: Class label to score; must be a key of ``priors``.

    Returns:
        The accumulated log score for ``label``.
    """
    tokens = preprocess(new_doc)
    log_prior = np.log(priors[label])
    token_log_likelihoods = (np.log(likelihood(token, label)) for token in tokens)
    # Start from the log prior and add the per-token terms left to right.
    return sum(token_log_likelihoods, log_prior)
# --- Training -------------------------------------------------------------
# Count, per class, how often each word occurs and how many documents the
# class has. The tables are built from the labels actually present in
# `documents` (in first-seen order) rather than from a hard-coded label set,
# so adding a new class to the corpus does not raise KeyError.
word_counts = {}    # label -> defaultdict(int) mapping word -> occurrences
class_counts = {}   # label -> number of training documents
vocab = set()       # every distinct word seen in any class
for doc, label in documents:
    words = preprocess(doc)
    if label not in word_counts:
        word_counts[label] = defaultdict(int)
    class_counts[label] = class_counts.get(label, 0) + 1
    for word in words:
        word_counts[label][word] += 1
        vocab.add(word)

# Class priors: fraction of training documents belonging to each class.
total_docs = sum(class_counts.values())
priors = {label: count / total_docs for label, count in class_counts.items()}

# Denominator terms for the smoothed likelihood: the number of word
# occurrences in each class, and the size of the shared vocabulary.
total_words = {label: sum(counts.values()) for label, counts in word_counts.items()}
vocab_size = len(vocab)

# --- Prediction -----------------------------------------------------------
# Score the new document against every class and keep the best-scoring one
# (on a tie, max() returns the label that appears first in `priors`).
posteriors = {label: compute_posterior(new_doc, label) for label in priors}
predicted_class = max(posteriors, key=posteriors.get)

# Interpolate new_doc so the message always names the document that was
# actually classified, instead of repeating its text as a literal.
print(f"The most likely class for the new document - '{new_doc}' is: {predicted_class}")