| 1 | +"""An example of an module with functions and a class that can be imported once the package is installed. |
| 2 | +This module provides operations for tokenization and tracking cumulative word counts in a set of docuents. |
| 3 | +""" |
| 4 | +from collections import Counter |
| 5 | +import logging |
| 6 | +import re |
| 7 | + |
| 8 | +import pandas as pd |
| 9 | + |
| 10 | +# You should use logging instead of print statements in code others will use, |
| 11 | +# so they can customize how much detail to see from your package |
| 12 | +# Refer to https://realpython.com/python-logging/ for detailed examples. |
| 13 | +logger = logging.getLogger(__name__) |
| 14 | + |
| 15 | + |
def tokenize(text, pattern=r"\s"):
    """Returns a list of strings: the text split into tokens based on the regex pattern used to identify boundaries.

    :param text: the document to tokenize
    :type text: str
    :param pattern: regex string to split the text on
    :type pattern: str
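
    Example (an illustrative doctest; output assumes the default whitespace pattern)::

        >>> tokenize("the quick brown fox")
        ['the', 'quick', 'brown', 'fox']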
    """
    logger.debug("Tokenizing '%s' with pattern '%s'", text, pattern)

    tokenized = re.split(pattern, text)
    logger.debug("%s token(s) found.", len(tokenized))
    return tokenized


class CorpusCounter:
    """A simple class that tracks document and token counts in a corpus.
    """

    def __init__(self, tokenization_pattern=r"\s", case_insensitive=False):
        """Constructor that instantiates with empty counters

        :param tokenization_pattern: An optional tokenization pattern so that all documents are tokenized consistently. Defaults to splitting on whitespace
        :param case_insensitive: Set to True to downcase tokens before counting, defaults to False
        """
        self.token_counter = Counter()
        self.doc_counter = 0
        self.tokenization_pattern = tokenization_pattern
        self.case_insensitive = case_insensitive
        logger.debug(
            "CorpusCounter instantiated, tokenization pattern: %s, case insensitive: %s",
            tokenization_pattern,
            case_insensitive,
        )

    def add_tokenized_doc(self, token_list):
        """Tallies an already tokenized document in the corpus.

        :param token_list: A tokenized document
        :type token_list: list or iterable of strings
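
        Example (an illustrative sketch of the expected behavior)::

            >>> cc = CorpusCounter()
            >>> cc.add_tokenized_doc(["the", "cat", "in", "the", "hat"])
            >>> cc.get_token_count("the")
            2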
        """
        before_vocab_size = self.get_vocab_size()
        non_empty_tokens = [w for w in token_list if w != ""]
        if self.case_insensitive:
            logger.info("Adding %s token(s) case insensitively", len(token_list))
            self.token_counter.update([w.lower() for w in non_empty_tokens])
        else:
            logger.info("Adding %s token(s) case sensitively", len(token_list))
            self.token_counter.update(non_empty_tokens)
        after_vocab_size = self.get_vocab_size()

        logger.info(
            "Vocabulary size increased by %s word types",
            after_vocab_size - before_vocab_size,
        )

        self.doc_counter += 1

    def add_doc(self, untokenized_doc):
        """Tokenizes a document and adds it to the corpus.

        :param untokenized_doc: The document to count tokens for
        :type untokenized_doc: str
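
        Example (illustrative; assumes the default whitespace tokenization pattern)::

            >>> cc = CorpusCounter()
            >>> cc.add_doc("a b c a")
            >>> cc.get_token_count("a")
            2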
        """
        tokenized = tokenize(untokenized_doc, self.tokenization_pattern)
        self.add_tokenized_doc(tokenized)

    def get_token_count(self, token):
        """Returns the count of a given token in the corpus

        :param token: The token to retrieve counts of
        :type token: str
        """
        return self.token_counter[token]

    def get_vocab_size(self):
        """Returns vocabulary size (number of unique tokens)
        """
        return len(self.token_counter)

    def get_token_counts_as_dataframe(self):
        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token' and 'count'
        """
        dataframe = pd.DataFrame.from_records(
            list(self.token_counter.items()), columns=["token", "count"]
        )
        dataframe = dataframe.sort_values("token")
        return dataframe

    def save_token_counts(self, csv_file):
        """Saves the counts of tokens in the corpus to the specified
        CSV file in alphabetical order

        :param csv_file: Path to desired CSV output file
        :type csv_file: str or Path
        """
        logger.info("Saving token counts to %s", csv_file)
        self.get_token_counts_as_dataframe().to_csv(csv_file, index=False, header=True)

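
# A minimal usage sketch (illustrative only; the example documents below are made up):
# build a counter, add a couple of documents, then inspect the cumulative counts.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    corpus_counter = CorpusCounter(case_insensitive=True)
    corpus_counter.add_doc("The quick brown fox")
    corpus_counter.add_doc("the lazy dog")
    print(corpus_counter.get_token_count("the"))  # 2
    print(corpus_counter.get_vocab_size())  # 6
    print(corpus_counter.get_token_counts_as_dataframe())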