Skip to content

Commit ec5017e

Browse files
committed
Implemented a function to remove stop words
1 parent 22fc390 commit ec5017e

File tree

3 files changed

+39
-0
lines changed

3 files changed

+39
-0
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,8 @@ You should also add project tags for each release in Github, see [Managing relea
3131
- Sample text data from Project Gutenberg
3232
- Data Version Control stage for the corpus_counter_script.py
3333
- A sample Jupyter notebook that plots most frequent words the Gutenberg data
34+
35+
## [3.0.0] - 2022-05-30
36+
### Added
37+
- Added a `remove_stop_words` function that removes stop words from a single document's tokens (not yet integrated into the `CorpusCounter` class).
38+
- Added a test case to test if the stop words are removed.

src/cdstemplate/word_count.py

+6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ def tokenize(text, pattern=r"\s"):
2727
logger.debug("%s token(s) found.", len(tokenized))
2828
return tokenized
2929

30+
def remove_stop_words(stop_words, tokens):
    """Return the list of tokens with stop words removed.

    Tokens are lowercased before comparison, so matching is
    case-insensitive against the stop word list; `stop_words` entries
    are therefore expected to be lowercase. Surviving tokens keep their
    original casing and order.

    :param stop_words: Lowercase stop words to filter out
    :type stop_words: iterable of str
    :param tokens: Tokens from a document
    :type tokens: list of str
    :return: Tokens that are not stop words
    :rtype: list of str
    """
    # Build a set once so each membership test is O(1) instead of
    # scanning the stop word list for every token.
    stop_word_set = set(stop_words)
    tokens_no_stop_words = [word for word in tokens if word.lower() not in stop_word_set]
    return tokens_no_stop_words
3036

3137
class CorpusCounter:
3238
"""A simple class object that tracks document and token counts in a corpus.

tests/test_word_count.py

+28
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,31 @@ def test_corpus_counter_save_csv(tmp_path):
103103
assert my_csv.is_file()
104104
expected_csv = "token,count\na,2\nb,1\nc,1\nx,1\ny,1\nz,1\n"
105105
assert my_csv.read_text() == expected_csv
106+
107+
108+
def test_stop_word_removal():
    """Stop words are removed case-insensitively; other tokens keep their order."""
    document = (
        "It was all very well to say `Drink me,' but the wise little "
        "Alice was not going to do that in a hurry."
    )
    stop_words = ["the", "and", "a", "to", "of", "in", "is", "that", "it", "on"]

    expected_tokens = [
        "was", "all", "very", "well", "say", "`Drink", "me,'", "but",
        "wise", "little", "Alice", "was", "not", "going", "do", "hurry.",
    ]

    result = word_count.remove_stop_words(stop_words, word_count.tokenize(document))
    assert result == expected_tokens

0 commit comments

Comments
 (0)