| 1 | +"""An example of an module with functions and a class that can be imported once the package is installed. |
| 2 | +This module provides operations for tokenization and tracking cumulative word counts in a set of docuents. |
| 3 | +""" |
| 4 | +from collections import Counter |
| 5 | +import logging |
| 6 | +import re |
| 7 | + |
| 8 | +import pandas as pd |
| 9 | + |
| 10 | +# You should use logging instead of print statements in code others will use, |
| 11 | +# so they can customize how much detail to see from your package |
| 12 | +# Refer to https://realpython.com/python-logging/ for detailed examples. |
| 13 | +logger = logging.getLogger(__name__) |
| 14 | + |
| 15 | + |
def tokenize(text, pattern=r"\s"):
    """Returns a list of strings: the text split into tokens based on the regex pattern used to identify boundaries.

    :param text: the document to tokenize
    :type text: str
    :param pattern: regex string to split the text on
    :type pattern: str
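
    Example (an illustrative doctest; output assumes the default whitespace pattern)::

        >>> tokenize("the quick brown fox")
        ['the', 'quick', 'brown', 'fox']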
    """
    logger.debug("Tokenizing '%s' with pattern '%s'", text, pattern)

    tokenized = re.split(pattern, text)
    logger.debug("%s token(s) found.", len(tokenized))
    return tokenized


class CorpusCounter:
    """A simple class that tracks document and token counts in a corpus.
    """

    def __init__(self, tokenization_pattern=r"\s", case_insensitive=False):
        """Constructor that instantiates with empty counters

        :param tokenization_pattern: An optional tokenization pattern so that all documents are tokenized consistently. Defaults to splitting on whitespace
        :param case_insensitive: Set to True to downcase tokens before counting, defaults to False
        """
        self.token_counter = Counter()
        self.doc_counter = 0
        self.tokenization_pattern = tokenization_pattern
        self.case_insensitive = case_insensitive
        logger.debug(
            "CorpusCounter instantiated, tokenization pattern: %s, case insensitive: %s",
            tokenization_pattern,
            case_insensitive,
        )

    def add_tokenized_doc(self, token_list):
        """Tallies an already tokenized document in the corpus.

        :param token_list: A tokenized document
        :type token_list: list or iterable of strings
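
        Example (an illustrative sketch of the expected behavior)::

            >>> cc = CorpusCounter()
            >>> cc.add_tokenized_doc(["the", "cat", "in", "the", "hat"])
            >>> cc.get_token_count("the")
            2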
        """
        before_vocab_size = self.get_vocab_size()
        non_empty_tokens = [w for w in token_list if w != ""]
        if self.case_insensitive:
            logger.info("Adding %s token(s) case insensitively", len(token_list))
            self.token_counter.update([w.lower() for w in non_empty_tokens])
        else:
            logger.info("Adding %s token(s) case sensitively", len(token_list))
            self.token_counter.update(non_empty_tokens)
        after_vocab_size = self.get_vocab_size()

        logger.info(
            "Vocabulary size increased by %s word types",
            after_vocab_size - before_vocab_size,
        )

        self.doc_counter += 1

    def add_doc(self, untokenized_doc):
        """Tokenizes a document and adds it to the corpus.

        :param untokenized_doc: The document to count tokens for
        :type untokenized_doc: str
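
        Example (illustrative; assumes the default whitespace tokenization pattern)::

            >>> cc = CorpusCounter()
            >>> cc.add_doc("a b c a")
            >>> cc.get_token_count("a")
            2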
        """
        tokenized = tokenize(untokenized_doc, self.tokenization_pattern)
        self.add_tokenized_doc(tokenized)

    def get_token_count(self, token):
        """Returns the count of a given token in the corpus

        :param token: The token to retrieve counts of
        :type token: str
        """
        return self.token_counter[token]

    def get_vocab_size(self):
        """Returns vocabulary size (number of unique tokens)
        """
        return len(self.token_counter)

    def get_token_counts_as_dataframe(self):
        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token' and 'count'
        """
        dataframe = pd.DataFrame.from_records(
            list(self.token_counter.items()), columns=["token", "count"]
        )
        dataframe = dataframe.sort_values("token")
        return dataframe

    def save_token_counts(self, csv_file):
        """Saves the counts of tokens in the corpus to the specified
        CSV file in alphabetical order

        :param csv_file: Path to desired CSV output file
        :type csv_file: str or Path
        """
        logger.info("Saving token counts to %s", csv_file)
        self.get_token_counts_as_dataframe().to_csv(csv_file, index=False, header=True)

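
# A minimal usage sketch (illustrative only; the example documents below are made up):
# build a counter, add a couple of documents, then inspect the cumulative counts.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    corpus_counter = CorpusCounter(case_insensitive=True)
    corpus_counter.add_doc("The quick brown fox")
    corpus_counter.add_doc("the lazy dog")
    print(corpus_counter.get_token_count("the"))  # 2
    print(corpus_counter.get_vocab_size())  # 6
    print(corpus_counter.get_token_counts_as_dataframe())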