Skip to content

Commit

Permalink
method for calculating document frequencies
Browse files Browse the repository at this point in the history
  • Loading branch information
jtauber committed Feb 9, 2024
1 parent 89d85fd commit 14bda2f
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 0 deletions.
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,49 @@ You can get a term frequency with `tf(term)` or `tf(term, address)`.

```

You can also get document frequency with `df(term)`.

```python
>>> c = termdoc.HTDM()
>>> c.add("1.1", ["foo", "bar"])
>>> c.add("1.2", ["bar"])
>>> c.add("2.1", ["foo"])
>>> c.add("2.2", ["foo", "bar", "baz"])
>>> c.df("foo")
0.75
>>> c.df("bar")
0.75
>>> c.df("baz")
0.25

```

By default this treats the leaves of the tree as the documents, but you can instead specify an explicit number of levels to go down. For example, the following will treat only `1` and `2` as the documents (not `1.1`, `1.2`, `2.1`, `2.2`):

```python
>>> c.df("foo", level=1)
1.0
>>> c.df("bar", level=1)
1.0
>>> c.df("baz", level=1)
0.5

```

Furthermore, you can scope the calculation to a subtree, in this case just the documents `1.1` and `1.2` under `1`:

```python
>>> c.df("foo", "1")
0.5
>>> c.df("bar", "1")
1.0
>>> c.df("baz", "1")
0.0

```
This scoping can be combined with the level limit.


### Duplicates Policy

You can optionally pass in a `duplicates` setting to the constructor indicating the policy you want to follow if a term-document count is updated more than once.
Expand Down
13 changes: 13 additions & 0 deletions termdoc/htdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,16 @@ def copy(self, prefix=None):

def tf(self, term, address=""):
return self.get_counts(address)[term] / self.get_counts(address).total()

def df(self, term, prefix="", level=None):
    """Return the fraction of documents under ``prefix`` that contain ``term``.

    By default the documents are whatever ``self.leaves()`` returns. When a
    truthy ``level`` is given, the documents are instead taken from
    ``self.counters[self.depth(prefix) + level]``, i.e. ``level`` levels
    below the depth of ``prefix``.

    Args:
        term: The term to look for in each document's counter.
        prefix: Only documents whose address starts with this string are
            considered. The empty string (default) selects every document.
        level: Number of levels below ``prefix`` to treat as documents.
            ``None`` (or any falsy value, including ``0``) means use leaves.

    Returns:
        The proportion of selected documents containing ``term`` as a float;
        ``0.0`` when no document matches ``prefix``.
    """
    if level:
        documents = self.counters[self.depth(prefix) + level]
    else:
        documents = self.leaves()

    # NOTE(review): this is a plain string prefix match, so prefix "1" would
    # also select an address such as "10.1" -- confirm that is intended.
    selected = 0
    containing = 0
    for address, counter in documents.items():
        if not address.startswith(prefix):
            continue
        selected += 1
        if term in counter:
            containing += 1

    if not selected:
        # No document in scope: report 0.0 rather than dividing by zero.
        return 0.0
    return containing / selected

54 changes: 54 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,60 @@ def test_term_frequency(self):
self.assertEqual(c.tf("foo"), 0.5)
self.assertEqual(c.tf("foo", "2"), 0.75)

def test_document_frequency(self):
    """Check ``df`` over leaves, explicit levels, and prefix-scoped subtrees."""
    import termdoc

    corpus = termdoc.HTDM()
    for address, terms in (
        ("1.1", ["foo", "bar"]),
        ("1.2", ["bar"]),
        ("2.1", ["foo"]),
        ("2.2", ["foo", "bar", "baz"]),
    ):
        corpus.add(address, terms)

    # Each case: (extra positional args, keyword args,
    #             expected df for "foo", "bar", "baz" in that order).
    cases = [
        ((), {}, (0.75, 0.75, 0.25)),
        ((), {"level": 1}, (1, 1, 0.5)),
        (("1",), {}, (0.5, 1, 0)),
        (("2",), {}, (1, 0.5, 0.5)),
        (("",), {}, (0.75, 0.75, 0.25)),
        (("",), {"level": 1}, (1, 1, 0.5)),
        ((), {"level": 2}, (0.75, 0.75, 0.25)),
        (("",), {"level": 2}, (0.75, 0.75, 0.25)),
        (("1",), {"level": 1}, (0.5, 1, 0)),
        (("2",), {"level": 1}, (1, 0.5, 0.5)),
    ]
    for args, kwargs, expected in cases:
        for term, value in zip(("foo", "bar", "baz"), expected):
            self.assertEqual(corpus.df(term, *args, **kwargs), value)

def test_wikipedia(self):
    """Check ``tf`` and ``df`` on a small two-document corpus."""
    import termdoc

    corpus = termdoc.HTDM()
    corpus.add("1", ["this", "is", "a", "a", "sample"])
    corpus.add(
        "2",
        ["this", "is", "another", "another", "example", "example", "example"],
    )

    # (method name, call arguments, expected value), checked in order.
    checks = [
        ("tf", ("this", "1"), 1 / 5),
        ("tf", ("this", "2"), 1 / 7),
        ("df", ("this",), 1),
        ("tf", ("example", "1"), 0),
        ("tf", ("example", "2"), 3 / 7),
        ("df", ("example",), 1 / 2),
    ]
    for method, args, expected in checks:
        self.assertEqual(getattr(corpus, method)(*args), expected)


# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 14bda2f

Please sign in to comment.