Skip to content

Commit ec5017e

Browse files
committed
Implemented a function to remove stop words
1 parent 22fc390 commit ec5017e

File tree

3 files changed

+39
-0
lines changed

3 files changed

+39
-0
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,8 @@ You should also add project tags for each release in Github, see [Managing relea
3131
- Sample text data from Project Gutenberg
3232
- Data Version Control stage for the corpus_counter_script.py
3333
- A sample Jupyter notebook that plots most frequent words the Gutenberg data
34+
35+
## [3.0.0] - 2022-05-30
36+
### Added
37+
- Added a `remove_stop_words` function that removes stop words from a single document's tokens (not yet integrated into the `CorpusCounter` class).
38+
- Added a test case to test if the stop words are removed.

src/cdstemplate/word_count.py

+6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ def tokenize(text, pattern=r"\s"):
2727
logger.debug("%s token(s) found.", len(tokenized))
2828
return tokenized
2929

30+
def remove_stop_words(stop_words, tokens):
    """Return the list of tokens with stop words removed.

    Tokens are lowercased before comparison, so matching is
    case-insensitive against the stop word list; `stop_words` entries
    are therefore expected to be lowercase. Surviving tokens keep their
    original casing and order.

    :param stop_words: Lowercase stop words to filter out
    :type stop_words: iterable of str
    :param tokens: Tokens from a document
    :type tokens: list of str
    :return: Tokens that are not stop words
    :rtype: list of str
    """
    # Build a set once so each membership test is O(1) instead of
    # scanning the stop word list for every token.
    stop_word_set = set(stop_words)
    tokens_no_stop_words = [word for word in tokens if word.lower() not in stop_word_set]
    return tokens_no_stop_words
3036

3137
class CorpusCounter:
3238
"""A simple class object that tracks document and token counts in a corpus.

tests/test_word_count.py

+28
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,31 @@ def test_corpus_counter_save_csv(tmp_path):
103103
assert my_csv.is_file()
104104
expected_csv = "token,count\na,2\nb,1\nc,1\nx,1\ny,1\nz,1\n"
105105
assert my_csv.read_text() == expected_csv
106+
107+
108+
def test_stop_word_removal():
    """Stop words are removed case-insensitively; other tokens keep their order."""
    document = (
        "It was all very well to say `Drink me,' but the wise little "
        "Alice was not going to do that in a hurry."
    )
    stop_words = ["the", "and", "a", "to", "of", "in", "is", "that", "it", "on"]

    expected_tokens = [
        "was", "all", "very", "well", "say", "`Drink", "me,'", "but",
        "wise", "little", "Alice", "was", "not", "going", "do", "hurry.",
    ]

    result = word_count.remove_stop_words(stop_words, word_count.tokenize(document))
    assert result == expected_tokens

0 commit comments

Comments
 (0)