
Commit bc2e623

Initial package with all documentation and workflow configs (#1)
* Outlined readme and started package contents
* Testing and logging examples
* Token counter script and example data
* Dependency tool section added
* DVC init and pipelines
* DVC section in README
* Updated module names, added directory and build info to README
* Updated changelog for v1.0.0 release
1 parent 140ae93 commit bc2e623

29 files changed: +1306, -1 lines

.dvc/.gitignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache

.dvc/config

Whitespace-only changes.

.dvcignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore

.github/workflows/python_package.yml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+name: Python package
+
+on:
+  pull_request:
+  push:
+    branches: [ $default-branch ]
+
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8
+        pip install .[test]
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
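
The workflow installs the package together with its test dependencies via pip install .[test]. The packaging file that declares that extra is not part of this diff, so the following is only a minimal sketch of how such a [test] extra could be declared with setuptools; the names and dependency list are assumptions, not the commit's actual configuration:

    # setup.py -- hypothetical sketch; the real packaging config is not shown in this commit
    from setuptools import setup, find_packages

    setup(
        name="cdstemplate",
        packages=find_packages(),
        install_requires=["pandas"],  # word_count.py imports pandas
        extras_require={"test": ["pytest"]},  # consumed by "pip install .[test]"
    )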

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+You should also add project tags for each release in GitHub, see [Managing releases in a repository](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository).
+
+## [Unreleased]
+
+## [1.0.0] - 2022-05-23
+### Added
+- README and CHANGELOG
+- cdstemplate packages for computing word count from input text
+- corpus_counter_script.py as a user-facing script with argparse examples
+- Tests of cdstemplate packages
+- A GitHub workflow to trigger tests on pull request to the main branch
+- Sample text data from Project Gutenberg
+- Data Version Control stage for the corpus_counter_script.py
+- A sample Jupyter notebook that plots the most frequent words in the Gutenberg data

README.md

Lines changed: 215 additions & 1 deletion
Large diffs are not rendered by default.

cdstemplate/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Avoid RuntimeWarnings about double imports when executing scripts, see https://stackoverflow.com/questions/43393764/python-3-6-project-structure-leads-to-runtimewarning
+import sys
+
+if "-m" not in sys.argv:
+    from . import corpus_counter_script
+    from . import word_count
+    from . import utils
+
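
A note on the guard above: when a submodule is executed with python -m, CPython's runpy machinery temporarily sets sys.argv[0] to "-m" while the parent package's __init__.py is being imported, which is what the check detects. A behavior sketch in comments (the invocation paths are illustrative assumptions):

    # python -c "import cdstemplate"
    #   __init__.py sees a normal sys.argv -> the submodules are imported eagerly
    #
    # python -m cdstemplate.corpus_counter_script out.csv data/a.txt
    #   while __init__.py runs, sys.argv[0] == "-m" (runpy's placeholder),
    #   so the eager imports are skipped and corpus_counter_script is only
    #   loaded once, avoiding the RuntimeWarning about double imports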

cdstemplate/corpus_counter_script.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+"""An example of a script you can run. It tokenizes a folder of input documents and
+writes the corpus counts to a user-specified CSV file
+"""
+# Import modules, functions and classes from external libraries
+import argparse
+import logging
+from pathlib import Path
+
+# Import the code from this project needed for this script
+from cdstemplate import word_count, utils
+
+logger = logging.getLogger(__name__)
+
+
+def main(csv_out, documents, case_insensitive=False):
+    """Determine cumulative word counts for a list of documents and write the results to a CSV file
+
+    :param csv_out: output CSV file path
+    :type csv_out: str or Path
+    :param documents: list of paths to documents to parse word counts from
+    :type documents: list of str
+    :param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False
+    :type case_insensitive: bool, optional
+    """
+    cc = word_count.CorpusCounter(case_insensitive=case_insensitive)
+    for i, doc in enumerate(documents):
+        if i % 2 == 0:
+            logger.info("Tokenizing document number %s: %s", i, doc)
+        cc.add_doc(Path(doc).read_text())
+
+    cc.save_token_counts(csv_out)
+
+
+# The argument parser gives nice ways to include help message and specify which arguments
+# are required or optional, see https://docs.python.org/3/library/argparse.html#prog for usage instructions
+parser = argparse.ArgumentParser(
+    description="A script to generate counts of tokens in a corpus"
+)
+
+parser.add_argument(
+    "csv", help="Path to the output CSV storing token counts. Required."
+)
+
+parser.add_argument(
+    "documents",
+    nargs="+",
+    help="Paths to at least one raw text document that make up the corpus. Required.",
+)
+parser.add_argument(
+    "--case-insensitive",
+    "-c",
+    action="store_true",
+    help="Default is to have case sensitive tokenization. Use this flag to make the token counting case insensitive. Optional.",
+)
+
+
+# The entry point of your script - if a user runs it from the command line, this is what will be run.
+if __name__ == "__main__":
+    args = parser.parse_args()
+    utils.configure_logging()
+    logger.info("Command line arguments: %s", args)
+    main(args.csv, args.documents, args.case_insensitive)
+
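Because the argparse wiring lives at module level and main() takes plain arguments, the script can also be driven programmatically. A minimal usage sketch, assuming two hypothetical text files under data/ (the paths are illustrative, not files from this commit):

    # Hypothetical programmatic usage; data/doc_a.txt and data/doc_b.txt are assumed to exist.
    from cdstemplate import corpus_counter_script, utils

    utils.configure_logging()
    corpus_counter_script.main(
        "token_counts.csv",
        ["data/doc_a.txt", "data/doc_b.txt"],
        case_insensitive=True,
    )

The command-line equivalent would be python -m cdstemplate.corpus_counter_script token_counts.csv data/doc_a.txt data/doc_b.txt --case-insensitive.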
cdstemplate/utils.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+"""A module for important set-up and configuration functionality that doesn't implement the library's key features.
+"""
+import logging
+
+
+def configure_logging():
+    """A helper method that configures logging, usable by any script in this library.
+    """
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="%(levelname)s : %(asctime)s : %(name)s : %(message)s",
+    )
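
Any script in the package can opt into this configuration before it starts logging. A minimal sketch (the logger name is illustrative):

    # Minimal usage of configure_logging; "my_script" is a made-up logger name.
    import logging

    from cdstemplate import utils

    utils.configure_logging()
    logger = logging.getLogger("my_script")
    logger.debug("DEBUG and above now appear in the configured format")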

cdstemplate/word_count.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+"""An example of a module with functions and a class that can be imported once the package is installed.
+This module provides operations for tokenization and tracking cumulative word counts in a set of documents.
+"""
+from collections import Counter
+import logging
+import re
+
+import pandas as pd
+
+# You should use logging instead of print statements in code others will use,
+# so they can customize how much detail to see from your package
+# Refer to https://realpython.com/python-logging/ for detailed examples.
+logger = logging.getLogger(__name__)
+
+
+def tokenize(text, pattern=r"\s"):
+    """Returns a list of strings, the text split into tokens based on the regex pattern to identify boundaries.
+
+    :param text: the document to tokenize
+    :type text: str
+    :param pattern: regex string to split the text on
+    :type pattern: str
+    """
+    logger.debug("Tokenizing '%s' with pattern '%s'", text, pattern)
+
+    tokenized = re.split(pattern, text)
+    logger.debug("%s token(s) found.", len(tokenized))
+    return tokenized
+
+
+class CorpusCounter:
+    """A simple class object that tracks document and token counts in a corpus.
+    """
+
+    def __init__(self, tokenization_pattern=r"\s", case_insensitive=False):
+        """Constructor instantiates with empty counters
+
+        :param tokenization_pattern: An optional tokenization pattern so that you are consistently tokenizing all documents the same. Defaults to splitting on whitespace
+        :param case_insensitive: Set to True to downcase tokens before counting, defaults to False
+        """
+        self.token_counter = Counter()
+        self.doc_counter = 0
+        self.tokenization_pattern = tokenization_pattern
+        self.case_insensitive = case_insensitive
+        logger.debug(
+            "CorpusCounter instantiated, tokenization pattern: %s, case insensitive: %s",
+            tokenization_pattern,
+            case_insensitive,
+        )
+
+    def add_tokenized_doc(self, token_list):
+        """Tallies an already tokenized document in the corpus.
+
+        :param token_list: A tokenized document
+        :type token_list: list or iterable of strings
+        """
+        before_vocab_size = self.get_vocab_size()
+        non_empty_tokens = [w for w in token_list if w != ""]
+        if self.case_insensitive:
+            logger.info("Adding %s token(s) case insensitively", len(token_list))
+            self.token_counter.update([w.lower() for w in non_empty_tokens])
+        else:
+            logger.info("Adding %s token(s) case sensitively", len(token_list))
+            self.token_counter.update(non_empty_tokens)
+        after_vocab_size = self.get_vocab_size()
+
+        logger.info(
+            "Vocabulary size increased by %s word types",
+            after_vocab_size - before_vocab_size,
+        )
+
+        self.doc_counter += 1
+
+    def add_doc(self, untokenized_doc):
+        """Tokenizes a document and adds it in the corpus.
+
+        :param untokenized_doc: The document to count tokens for
+        :type untokenized_doc: str
+        """
+        tokenized = tokenize(untokenized_doc, self.tokenization_pattern)
+        self.add_tokenized_doc(tokenized)
+
+    def get_token_count(self, token):
+        """Returns the count of a given token in the corpus
+
+        :param token: The token to retrieve counts of
+        :type token: str
+        """
+        return self.token_counter[token]
+
+    def get_vocab_size(self):
+        """Returns vocabulary size (number of unique tokens)
+        """
+        return len(self.token_counter)
+
+    def get_token_counts_as_dataframe(self):
+        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token', 'count'
+        """
+        dataframe = pd.DataFrame.from_records(
+            list(self.token_counter.items()), columns=["token", "count"]
+        )
+        dataframe = dataframe.sort_values("token")
+        return dataframe
+
+    def save_token_counts(self, csv_file):
+        """Saves the counts of tokens in the corpus to a specified
+        CSV file in alphabetical order
+
+        :param csv_file: Path to desired CSV output file
+        :type csv_file: str or Path
+        """
+        logger.info("Saving token counts to %s", csv_file)
+        self.get_token_counts_as_dataframe().to_csv(csv_file, index=False, header=True)
+
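Putting the pieces together, a short sketch of CorpusCounter in use; the two example sentences are invented for illustration:

    # Illustrative usage of CorpusCounter with made-up documents.
    from cdstemplate.word_count import CorpusCounter

    cc = CorpusCounter(case_insensitive=True)
    cc.add_doc("The cat sat on the mat")
    cc.add_doc("The dog chased the cat")

    print(cc.get_token_count("the"))  # 4 -- "The" and "the" are merged
    print(cc.get_vocab_size())        # 7 unique downcased tokens
    cc.save_token_counts("counts.csv")  # token,count rows in alphabetical order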
