WordSiv Release 0.3.0

* add language-support.md * fix links * delete punctuation.md
tallpauley · Feb 1, 2025 · d3db268 · d3db268
1 parent 2859a84
commit d3db268
Show file tree

Hide file tree

Showing 14 changed files with 197 additions and 34 deletions.
diff --git a/docs/release-notes.md b/docs/release-notes.md
@@ -1,3 +1,18 @@
+## 0.3.0 – 2025-02-01
+
+### Added
+
+- Documentation on [language support](usage/language-support.md)
+
+### Fixed
+
+- internal: `tests/test_snippets.py` now runs each snippet from the snippet's own directory
+- Minor site/docs fixes
+
+### Removed
+
+- `docs/punctuation.md` (unused)
+
 ## 0.2.4 – 2025-01-21
 
 ### Fixed

diff --git a/docs/snippets/add-vocab-punc.py b/docs/snippets/add-vocab-punc.py
@@ -0,0 +1,36 @@
+from wordsiv import Vocab, WordSiv
+
+# Define the punctuation dictionary
+de_punc = {
+    "insert": {
+        " ": 0.365,
+        ", ": 0.403,
+        ": ": 0.088,
+        "; ": 0.058,
+        "–": 0.057,
+        "—": 0.022,
+        " … ": 0.006,
+    },
+    "wrap_sent": {
+        ("", "."): 0.923,
+        ("", "!"): 0.034,
+        ("", "?"): 0.04,
+        ("", "…"): 0.003,
+    },
+    "wrap_inner": {
+        ("", ""): 0.825,
+        ("(", ")"): 0.133,
+        ("‘", "’"): 0.013,
+        ("“", "”"): 0.028,
+    },
+}
+
+# Create a Vocab from a file, this time passing punctuation
+de_vocab = Vocab(lang="de", data_file="de.tsv", bicameral=True, punctuation=de_punc)
+
+# Add Vocab to WordSiv Object
+ws = WordSiv()
+ws.add_vocab("de-subtitles", de_vocab)
+
+# Try it out, turning up punctuation randomness so we see more variation
+print(ws.para(vocab="de-subtitles", rnd_punc=0.5))
diff --git a/docs/snippets/add-vocab.py b/docs/snippets/add-vocab.py
@@ -0,0 +1,11 @@
+from wordsiv import Vocab, WordSiv
+
+# Create a Vocab from a file
+de_vocab = Vocab(lang="de", data_file="de.tsv", bicameral=True)
+
+# Add Vocab to WordSiv object
+ws = WordSiv()
+ws.add_vocab("de-subtitles", de_vocab)
+
+# Try it out
+print(ws.sent(vocab="de-subtitles"))
diff --git a/docs/snippets/de.tsv b/docs/snippets/de.tsv
@@ -0,0 +1,20 @@
+ich	3699605
+sie	2409949
+das	1952794
+ist	1920535
+du	1890181
+nicht	1734016
+die	1585020
+es	1460530
+und	1441012
+der	1109693
+wir	1075801
+was	1072372
+zu	918548
+er	851812
+ein	841835
+in	793011
+mir	645137
+mit	641744
+ja	635186
+den	588653
diff --git a/docs/usage/basic-usage.md b/docs/usage/basic-usage.md
@@ -10,12 +10,12 @@ sentence:
 
 You should see a random sentence in the console in the lower-right of DrawBot!
 
-Check out the [Quick Reference](../../examples/quick-reference) if you want to
+Check out the [Quick Reference](../examples/quick-reference.md) if you want to
 quickly jump into WordSiv, or read on for more detailed information.
 
 ## Listing Vocabs
 
-WordSiv generates text using [Vocabs](../../api-reference/#wordsiv.Vocab):
+WordSiv generates text using [Vocabs](../api-reference.md#wordsiv.Vocab):
 objects that contain a word list (usually with occurrence counts) for a given
 language. WordSiv includes some Vocabs, and you can make your own (instructions
 coming soon!). You can see all available Vocabs with: `list_vocabs()`:

diff --git a/docs/usage/filtering-words.md b/docs/usage/filtering-words.md
@@ -1,7 +1,7 @@
 # Filtering Words
 
 WordSiv provides options for filtering the words that are used
-to [generate text](../generating-text):
+to [generate text](generating-text.md):
 
 - **Letter Case: [`case`](#filter-words-by-letter-case)**
 - **Word Length: [`wl`](#filter-words-by-word-length), [`min_wl`](#filter-words-by-word-length), [`max_wl`](#filter-words-by-word-length)**

diff --git a/docs/usage/generating-text.md b/docs/usage/generating-text.md
@@ -41,7 +41,7 @@ text generation methods it calls:
 
 ### Random Word (`word()`)
 
-The `word()` method returns a single word, randomly selected from the Vocab (weighted by word probability). See also [word filter arguments](../filtering-words).
+The `word()` method returns a single word, randomly selected from the Vocab (weighted by word probability). See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "word.py"
@@ -50,7 +50,7 @@ The `word()` method returns a single word, randomly selected from the Vocab (wei
 ### Most Common Word (`top_word()`)
 
 The `top_word()` method returns the most common word or the nth common word.
-See also [word filter arguments](../filtering-words).
+See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "top-word.py"
@@ -59,7 +59,7 @@ See also [word filter arguments](../filtering-words).
 ### List of Random Words (`words()`)
 
 The `words()` method returns a list of words generated by `word()` (randomly
-selected from the Vocab, weighted by word probability). See also [word filter arguments](../filtering-words).
+selected from the Vocab, weighted by word probability). See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "words.py"
@@ -68,7 +68,7 @@ selected from the Vocab, weighted by word probability). See also [word filter ar
 ### List of Most Common Words (`top_words()`)
 
 The `top_words()` method returns a list of the most common words in descending
-frequency order. See also [word filter arguments](../filtering-words).
+frequency order. See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "top-words.py"
@@ -78,7 +78,7 @@ frequency order. See also [word filter arguments](../filtering-words).
 
 The `sent()` method returns a single sentence, joining the output of `words()`
 with punctuation (optionally). See also [word filter
-arguments](../filtering-words).
+arguments](filtering-words.md).
 
 ```python
 --8<-- "sent.py"
@@ -87,7 +87,7 @@ arguments](../filtering-words).
 ### List of Sentences (`sents()`)
 
 The `sents()` method returns a list of sentences generated from `sent()`. See
-also [word filter arguments](../filtering-words).
+also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "sents.py"
@@ -96,7 +96,7 @@ also [word filter arguments](../filtering-words).
 ### Paragraph (`para()`)
 
 The `para()` method returns a single paragraph, joining the output of `sents()`.
-See also [word filter arguments](../filtering-words).
+See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "para.py"
@@ -105,7 +105,7 @@ See also [word filter arguments](../filtering-words).
 ### Multiple Paragraphs (`paras()`)
 
 The `paras()` method returns a list of paragraphs generated from `para()`.
-See also [word filter arguments](../filtering-words).
+See also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "paras.py"
@@ -114,7 +114,7 @@ See also [word filter arguments](../filtering-words).
 ### Text Block (`text()`)
 
 The `text()` method generates a text block, joining the output of `paras()`. See
-also [word filter arguments](../filtering-words).
+also [word filter arguments](filtering-words.md).
 
 ```python
 --8<-- "text.py"

diff --git a/docs/usage/language-support.md b/docs/usage/language-support.md
@@ -0,0 +1,91 @@
+# Language Support
+
+## Vocab
+In WordSiv, a [Vocab](../api-reference.md#wordsiv.Vocab) is an object that contains
+a word list and other language-specific data that allow a WordSiv object to
+appropriately filter words and generate text.
+
+!!! Note
+    I considered naming this object **WordList**, but it also can contain
+    word counts and punctuation data. I considered calling it **Lang**, but it's
+    possible to have more than one set of words (and punctuation, etc.) per
+    language. I can imagine having Vocabs derived from different genres of text:
+    `en-news`, `en-wiki`, etc!
+
+### Using a Built-in Vocab
+
+See [Basic Usage](basic-usage.md) for how to list and select a built-in Vocab.
+If you're curious about the origin/license[^1] of these lists you can examine
+the built-in Vocabs in [wordsiv/_vocab_data][vocab-data].
+
+### Creating a custom Vocab
+
+It's easy to add your own Vocab to WordSiv. The harder part is actually deriving
+wordlists from a [text corpus](https://en.wikipedia.org/wiki/Text_corpus) and
+refining the capitalization (if applicable), which we won't detail here.
+
+Let's say we grab the top 20 German words from this [frequency wordlist derived
+from OpenSubtitles][hermit-de], and save it as `de.tsv` (replacing spaces
+with tabs):
+```
+--8<-- "de.tsv"
+```
+
+We can now create a Vocab and add it to WordSiv:
+```python
+--8<-- "add-vocab.py"
+```
+
+We get the output:
+> Die du die der ich nicht sie das und e
+
+#### Adding Custom Punctuation to a Vocab
+
+But what if we want punctuation? We have some default punctuation for the
+built-in languages in [wordsiv/_punctuation.py][punctuation-py], but not yet for
+German (at the time of writing). Let's copy/paste the English one (for now[^2])
+and try it out:
+```python
+--8<-- "add-vocab-punc.py"
+```
+
+Now we see punctuation:
+> Ich ist mit das ich (du und) mit es sie… Nicht das was zu sie—du die ja nicht
+> und zu ist du? Das er das “wir” ich was sie der du mit das die und zu ich. In
+> und in, ich ja ich die der das (nicht er sie ich) mir.
+
+
+### Contributing Vocabs to WordSiv
+
+WordSiv is only as good as the Vocabs (and punctuation dictionaries!) that
+are available to it, and we'd love any help on improving language support. Feel
+free to [create an issue on the GitHub
+repo](https://github.com/tallpauley/wordsiv/issues) if you're interested in
+helping us improve language support. You don't even have to be a programmer—we
+just need native speakers to help us construct useful Vocabs. However, if you
+are looking to learn some programming, building wordlists and punctuation can be
+a fun first project (and I'd be glad to help!).
+
+My long-term vision is to build a community-maintained project (outside of
+WordSiv) that has a huge selection of multilingual proofing text, wordlists,
+punctuation, etc. and resources and code that enable the global type community
+to more easily leverage the language data that is commonplace in
+NLP/linguistics/engineering circles. A lot of the source data
+[already](https://github.com/simoncozens/gobbet)
+[exists](https://cldr.unicode.org/); it just needs to be adapted for the
+needs/tooling of type designers.
+
+[^1]: Licensing for wordlists is a bit odd, because they're often built by
+crawling a bunch of data with all kinds of licenses. I'm just doing my best here
+to respect licenses where I can!
+[^2]: I'd recommend deriving punctuation frequencies for the target language
+from [real text][leipzig], and normalizing the probabilities between 0 and 1. I
+have a script that builds these dictionaries, which I hope to publish soon!
+
+[leipzig]: https://wortschatz.uni-leipzig.de/en/
+[vocab-data]:
+    https://github.com/tallpauley/wordsiv/tree/main/wordsiv/_vocab_data
+[punctuation-py]:
+    https://github.com/tallpauley/wordsiv/tree/main/wordsiv/_punctuation.py
+[hermit-de]:
+    https://github.com/hermitdave/FrequencyWords/blob/master/content/2016/de/de_50k.txt
diff --git a/docs/usage/punctuation.md b/docs/usage/punctuation.md
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -10,6 +10,7 @@ theme:
     - navigation.tabs
     - navigation.tabs.sticky
 markdown_extensions:
+  - footnotes
   - attr_list
   - admonition
   - pymdownx.highlight:
@@ -33,9 +34,9 @@ nav:
       - Basic Usage: usage/basic-usage.md
       - Generating Text: usage/generating-text.md
       - Filtering Words: usage/filtering-words.md
+      - Language Support: usage/language-support.md
   - API Reference:
-      wordsiv:
-        wordsiv: api-reference.md
+      - wordsiv: api-reference.md
   - Release Notes: release-notes.md
 repo_url: https://github.com/tallpauley/wordsiv
 watch:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "wordsiv"
-version = "0.2.4"
+version = "0.3.0"
 description = "Generate text with a limited character set for font proofing"
 authors = ["Chris Pauley <[email protected]>"]
 license = "MIT"

diff --git a/tests/test_snippets.py b/tests/test_snippets.py
@@ -1,22 +1,24 @@
 import pytest
 from pathlib import Path
 import subprocess
+import os
 
 # Directory containing the snippet files
 SNIPPETS_DIR = Path("docs/snippets")
 
 
 def get_snippet_files():
     """Retrieve a list of all Python (.py) files in the snippets directory."""
-    return [f for f in SNIPPETS_DIR.iterdir() if f.is_file()]
+    return [f for f in SNIPPETS_DIR.iterdir() if f.is_file() and f.suffix == ".py"]
 
 
 @pytest.mark.parametrize("snippet_file", get_snippet_files())
 def test_snippet(snippet_file):
     """Run each snippet file as a test."""
-    # Assuming the snippets are Python scripts, you can execute them using subprocess
+    cwd = os.path.dirname(os.path.realpath(snippet_file))
+    filename = os.path.basename(snippet_file)
     result = subprocess.run(
-        ["python", str(snippet_file)], capture_output=True, text=True
+        ["python", filename], capture_output=True, text=True, cwd=cwd
     )
 
     # Check if the script ran successfully

diff --git a/wordsiv/__init__.py b/wordsiv/__init__.py
@@ -789,7 +789,7 @@ def para(
             seed (float | str | None): Seed the random number generator if seed is not
                 None.
             sent_sep (str): The string used to join sentences.
-             **sents_kwargs: Keyword arguments passed to `sents(...)`.
+            **sents_kwargs: Keyword arguments passed to `sents(...)`.
 
         Returns:
             str: A single paragraph containing multiple sentences.

diff --git a/wordsiv/_filter.py b/wordsiv/_filter.py
@@ -26,8 +26,8 @@ class FilterError(Exception):
 ]
 """
 Options for setting case via the `case` argument.
-See [Letter Case](../guide/filtering-words/#letter-case) in the Guide for detailed descriptions and examples of each
-option
+See [Letter Case](usage/filtering-words.md#filter-words-by-letter-case) in the Guide for
+detailed descriptions and examples of each option
 """