-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature.py
63 lines (44 loc) · 1.62 KB
/
feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Implement a TfidfVectorizer!
"""
from collections import Counter

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
class TfidfVectorizer(BaseEstimator, TransformerMixin):
    """
    Implements a TF-IDF vectorizer.

    Each document in ``X`` is expected to be an iterable of string tokens
    (e.g. a list of words) — assumes tokenization happened upstream; TODO
    confirm against callers.

    Parameters
    ----------
    numwords : int
        Vocabulary size: the number of most-frequent words to keep.

    Attributes
    ----------
    feature_names_ : list of str
        Vocabulary kept after ``fit``, ordered by descending corpus
        frequency; index in this list == column index in the output.
    idf_ : numpy.ndarray of shape (n_features,)
        Inverse document frequency per vocabulary word,
        ``log(N / (1 + df_i))`` where ``N`` is the number of fit documents.
    """

    def __init__(self, numwords):
        # sklearn convention: __init__ only records hyper-parameters.
        # Fitted attributes (feature_names_, idf_) are deliberately NOT
        # created here — check_is_fitted keys on their *absence* before
        # fit(); pre-setting them to None would defeat that guard.
        self.set_params(numwords=numwords)

    def fit(self, X, y=None):
        """
        Fit the vectorizer on documents ``X``.

        1. Count token occurrences across all documents and keep the
           ``numwords`` most common as ``self.feature_names_`` (ordered
           by frequency).
        2. Build ``self.idf_``: for word i present in ``df_i`` documents
           out of ``N``, ``idf_i = log(N / (1 + df_i))``.

        Parameters
        ----------
        X : sequence of token iterables
            The documents to fit on.
        y : ignored
            Present for sklearn pipeline compatibility.

        Returns
        -------
        self
        """
        term_counts = Counter()   # total occurrences per word (for ranking)
        doc_counts = Counter()    # number of documents containing each word
        for doc in X:
            tokens = list(doc)
            term_counts.update(tokens)
            # set() so each document contributes at most 1 to doc frequency
            doc_counts.update(set(tokens))

        # Most frequent first; ties broken by Counter's insertion order.
        self.feature_names_ = [
            word for word, _ in term_counts.most_common(self.numwords)
        ]

        n_docs = len(X)
        df = np.array(
            [doc_counts[word] for word in self.feature_names_], dtype=float
        )
        # Smoothed IDF per the spec: log(N / (1 + df)).
        self.idf_ = np.log(n_docs / (1.0 + df))
        return self

    def transform(self, X):
        """
        Transform documents ``X`` into a TF-IDF matrix.

        1. Compute the term frequency of each vocabulary word per document.
        2. Scale each document's TF row by ``self.idf_``.

        Parameters
        ----------
        X : sequence of token iterables
            The documents to transform.

        Returns
        -------
        numpy.ndarray of shape (len(X), n_features)
            TF-IDF features; words outside the fitted vocabulary are ignored.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        check_is_fitted(self, ["feature_names_", "idf_"])
        # O(1) token -> column lookup instead of list.index per token.
        column = {word: j for j, word in enumerate(self.feature_names_)}
        # len(feature_names_) may be < numwords when the fitted corpus
        # vocabulary is smaller than the requested size.
        X_feat = np.zeros((len(X), len(self.feature_names_)))
        for i, doc in enumerate(X):
            for token in doc:
                j = column.get(token)
                if j is not None:
                    X_feat[i, j] += 1.0
        return X_feat * self.idf_