UMassCDS · ginic · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024
diff --git a/.github/workflows/python_package.yml b/.github/workflows/python_package.yml
@@ -13,24 +13,28 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.10", "3.11", "3.12"]
-
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v3
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install flake8
           pip install .[test]
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
       - name: Test with pytest
         run: |
-          pytest
+          pytest
+
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Ruff Lint
+        uses: astral-sh/ruff-action@v1
+        with:
+          args: "check --config pyproject.toml"
+      - name: Ruff Format
+        uses: astral-sh/ruff-action@v1
+        with:
+          args: "format --diff"
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 You should also add project tags for each release in Github, see [Managing releases in a repository](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository).
 
+## [Unreleased]
+### Changed
+- GitHub workflow for linting and formatting uses ruff as a separate job
+
+### Removed
+- GitHub action to run flake8 for linting in build
+
+
 ## [2.0.0] - 2024-05-29
 ### Added
 - Added example auto-built Sphinx documentation in the `docs` folder

diff --git a/docs/conf.py b/docs/conf.py
@@ -12,44 +12,44 @@
 #
 import os
 import sys
-sys.path.insert(0, os.path.abspath('..'))
+
+sys.path.insert(0, os.path.abspath(".."))
 
 
 # -- Project information -----------------------------------------------------
 
-project = 'testdoc'
-copyright = '2023, Luke Ruud'
-author = 'Luke Ruud'
+project = "testdoc"
+copyright = "2023, Luke Ruud"
+author = "Luke Ruud"
 
 # The full version, including alpha/beta/rc tags
-release = '0.1'
+release = "0.1"
 
 
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"
-]
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'alabaster'
+html_theme = "alabaster"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
diff --git a/notebooks/word_count_prototype.ipynb b/notebooks/word_count_prototype.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,8 @@ dependencies = [
 # Extra dependencies only needed for running tests go here
 test = ["pytest"]
 
-# Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here
+# Dependencies that are useful only to developers, like an autoformatter and
+# support for visualizations in jupyter notebooks go here
 dev = [
     "ruff",
     "jupyter",
@@ -45,3 +46,8 @@ dev = [
 # The value must be of the form "<package_name>:<module_name>.<function>"
 [project.scripts]
 corpus-counter = "cdstemplate:corpus_counter_script.main_cli"
+
+# Add customizations to the Ruff linter as described on https://docs.astral.sh/ruff/configuration/
+[tool.ruff]
+# Override ruff's default line length so that lines are at most 119 characters (i.e. fewer than 120)
+line-length = 119
diff --git a/src/cdstemplate/corpus_counter_script.py b/src/cdstemplate/corpus_counter_script.py
@@ -1,6 +1,7 @@
 """An example of a script you can run. It tokenizes an folder of input documents and
 writes the corpus counts to a user-specified CSV file
 """
+
 # Import modules, functions and classes from external libraries
 import argparse
 import logging
@@ -11,19 +12,16 @@
 
 logger = logging.getLogger(__name__)
 
+
 def main_cli():
-    """A wrapper function that defines command line arguments and help messages for 
-    when the user wants run this module's code as a script. 
+    """A wrapper function that defines command line arguments and help messages for
+    when the user wants to run this module's code as a script.
     """
     # The argument parser gives nice ways to include help message and specify which arguments
     # are required or optional, see https://docs.python.org/3/library/argparse.html#prog for usage instructions
-    parser = argparse.ArgumentParser(
-        description="A script to generate counts of tokens in a corpus"
-    )
+    parser = argparse.ArgumentParser(description="A script to generate counts of tokens in a corpus")
 
-    parser.add_argument(
-        "csv", help="Path to the output CSV storing token counts. Required."
-    )
+    parser.add_argument("csv", help="Path to the output CSV storing token counts. Required.")
 
     parser.add_argument(
         "documents",
@@ -64,5 +62,5 @@ def main(csv_out, documents, case_insensitive=False):
 
 # The entry point of your script - if a user runs it from the command line, for example using `python -m <package>.<module>`
 # or `python <script_path>.py`, this is what will be run.
-if __name__ == "__main__":    
+if __name__ == "__main__":
     main_cli()
diff --git a/src/cdstemplate/utils.py b/src/cdstemplate/utils.py
@@ -1,11 +1,10 @@
-"""A module for important set-up and configuration functionality, but doesn't implement the library's key features.
-"""
+"""A module for important set-up and configuration functionality, but doesn't implement the library's key features."""
+
 import logging
 
 
 def configure_logging():
-    """A helper method that configures logging, usable by any script in this library.
-    """
+    """A helper method that configures logging, usable by any script in this library."""
     logging.basicConfig(
         level=logging.DEBUG,
         format="%(levelname)s : %(asctime)s : %(name)s : %(message)s",

diff --git a/src/cdstemplate/word_count.py b/src/cdstemplate/word_count.py
@@ -1,6 +1,7 @@
 """An example of an module with functions and a class that can be imported once the package is installed.
 This module provides operations for tokenization and tracking cumulative word counts in a set of documents.
 """
+
 from collections import Counter
 import logging
 import re
@@ -29,8 +30,7 @@ def tokenize(text, pattern=r"\s"):
 
 
 class CorpusCounter:
-    """A simple class object that tracks document and token counts in a corpus.
-    """
+    """A simple class object that tracks document and token counts in a corpus."""
 
     def __init__(self, tokenization_pattern=r"\s", case_insensitive=False):
         """Constructor instantiates with empty counters
@@ -89,16 +89,12 @@ def get_token_count(self, token):
         return self.token_counter[token]
 
     def get_vocab_size(self):
-        """Returns vocabulary size (number of unique tokens)
-        """
+        """Returns vocabulary size (number of unique tokens)"""
         return len(self.token_counter)
 
     def get_token_counts_as_dataframe(self):
-        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token', 'count'
-        """
-        dataframe = pd.DataFrame.from_records(
-            list(self.token_counter.items()), columns=["token", "count"]
-        )
+        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token', 'count'"""
+        dataframe = pd.DataFrame.from_records(list(self.token_counter.items()), columns=["token", "count"])
         dataframe = dataframe.sort_values("token")
         return dataframe
 
@@ -111,4 +107,3 @@ def save_token_counts(self, csv_file):
         """
         logger.info("Saving token counts to %s", csv_file)
         self.get_token_counts_as_dataframe().to_csv(csv_file, index=False, header=True)
-
diff --git a/tests/test_word_count.py b/tests/test_word_count.py
@@ -2,12 +2,15 @@
 
 In pytest, each individual test is a python function that starts with `test`.
 """
+
 # Import your library for testing
 from cdstemplate import word_count
 
 
 def test_tokenize_document():
-    my_document = "It was all very well to say `Drink me,' but the wise little Alice was not going to do that in a hurry."
+    my_document = (
+        "It was all very well to say `Drink me,' but the wise little Alice was not going to do that in a hurry."
+    )
 
     expected_tokens = [
         "It",