Skip to content

Commit 2debe32

Browse files
committed
Implement garbage collection (#141)
1 parent 0425ee6 commit 2debe32

File tree

2 files changed

+115
-2
lines changed

2 files changed

+115
-2
lines changed

backend/catalogs/triplestore.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Functions that deal with adding and updating catalog records in the
22
triplestore."""
3+
from typing import Optional
34
from itertools import chain
45
import datetime as dt
56
from django.conf import settings
@@ -17,6 +18,11 @@
1718
RECORDS_GC_GRAPH_IDENTIFIER = URIRef(RECORDS_GC_GRAPH_URI)
1819
SCHEMA = Namespace('https://schema.org/')
1920

21+
# When we retrieve records from a catalog, they might be duplicates of records
22+
# that were retrieved before. The following query gets rid of the duplicates. It
23+
# is meant to be executed just before the newly retrieved records are added. The
24+
# net effect should be that all `obsolete_records` remain in the triplestore;
25+
# the query is not meant for permanently deleting records.
2026
purge_old_update = '''
2127
delete {{
2228
graph <{records_graph}> {{
@@ -40,6 +46,40 @@
4046
}}
4147
'''.format
4248

49+
# The following update query identifies and removes records that are not in any
50+
# collection (no `schema:upvoteCount`, or a count of 0) and whose `schema:uploadDate` is before the given `cutoff_date`.
51+
garbage_collect_update = '''
52+
delete {{
53+
graph <{records_graph}> {{
54+
?r ?p1 ?o1 .
55+
?f ?p2 ?o2 .
56+
}}
57+
graph <{gc_graph}> {{
58+
?r schema:uploadDate ?d ;
59+
schema:upvoteCount 0 .
60+
}}
61+
}}
62+
where {{
63+
graph <{gc_graph}> {{
64+
{{
65+
?r schema:uploadDate ?d ;
66+
schema:upvoteCount 0 .
67+
}}
68+
union
69+
{{
70+
?r schema:uploadDate ?d .
71+
filter not exists {{ ?r schema:upvoteCount ?c }}
72+
}}
73+
filter ( ?d < {cutoff_date} )
74+
}}
75+
graph <{records_graph}> {{
76+
?r ?p1 ?o1 ;
77+
?pt ?f .
78+
optional {{?f ?p2 ?o2 .}}
79+
}}
80+
}}
81+
'''.format
82+
4383

4484
def prune_recursively(graph: Graph, subject: Node):
4585
"""Recursively prune triples """
@@ -81,3 +121,18 @@ def save_to_triplestore(content_graph: Graph, records: list[Node]) -> None:
81121
quads_gc = ((rec, SCHEMA.uploadDate, now, gc_graph) for rec in records)
82122
store.addN(chain(quads, quads_gc))
83123
store.commit()
124+
125+
126+
def collect_garbage(until: Optional[dt.date] = None) -> None:
    """Remove stale, unused records from the triplestore.

    Formats `garbage_collect_update` with the records graph, the GC graph and
    `until` as the cutoff date, runs it against `settings.RDFLIB_STORE` and
    commits the store.

    When `until` is omitted, the cutoff is two weeks before today.
    """
    cutoff = (
        until if until is not None
        else dt.date.today() - dt.timedelta(weeks=2)
    )
    triplestore = settings.RDFLIB_STORE
    # The cutoff is injected as an N3-serialized literal so that it lands in
    # the query text as a typed value.
    query = garbage_collect_update(
        records_graph=RECORDS_GRAPH_URI,
        gc_graph=RECORDS_GC_GRAPH_URI,
        cutoff_date=Literal(cutoff).n3(),
    )
    triplestore.update(query, initNs={'schema': SCHEMA})
    triplestore.commit()

backend/catalogs/triplestore_test.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import pytest
2+
import datetime as dt
3+
24
from edpop_explorer import Record, EDPOPREC, BibliographicalRecord, Field
3-
from rdflib import Graph, RDF, URIRef
5+
from rdflib import Graph, RDF, URIRef, Literal
46

57
from .graphs_test import MockReader
6-
from .triplestore import save_to_triplestore, remove_from_triplestore, SCHEMA
8+
from .triplestore import collect_garbage, save_to_triplestore, \
9+
remove_from_triplestore, SCHEMA, RECORDS_GC_GRAPH_IDENTIFIER
710
from operator import attrgetter
811

912

@@ -29,6 +32,14 @@ def record_nodes(record_instances):
2932
return map(attrgetter('subject_node'), record_instances)
3033

3134

35+
@pytest.fixture
def working_data_saved(working_data_graph, triplestore):
    """Save the working data to the triplestore before the test runs.

    Returns a `(nodes, records, graph)` tuple, where `nodes` is the list
    produced by `record_nodes(records)`.
    """
    record_instances, content_graph = working_data_graph
    subject_nodes = [*record_nodes(record_instances)]
    save_to_triplestore(content_graph, subject_nodes)
    return subject_nodes, record_instances, content_graph
41+
42+
3243
def stored_records(triplestore):
    """Return a list of the subjects in `triplestore` typed `EDPOPREC.Record`."""
    record_subjects = triplestore.subjects(RDF.type, EDPOPREC.Record)
    return [*record_subjects]
@@ -85,3 +96,50 @@ def test_remove_nonexistent_subject(working_data_graph):
8596
# Removing subjects that don't exist should not cause any problems
8697
records, _ = working_data_graph
8798
remove_from_triplestore(records)
99+
100+
101+
def test_gc_retain_recent(working_data_saved, triplestore):
    """Freshly saved records survive a collection with a cutoff in the past."""
    one_week_ago = dt.date.today() - dt.timedelta(weeks=1)
    collect_garbage(one_week_ago)
    remaining = stored_records(triplestore)
    assert len(remaining) == 2
    assert stored_records_match_tracked_records(triplestore)
106+
107+
108+
def test_gc_remove_outdated(working_data_saved, triplestore):
    """With a cutoff one week ahead, all saved records are collected."""
    one_week_ahead = dt.date.today() + dt.timedelta(weeks=1)
    collect_garbage(one_week_ahead)
    remaining = stored_records(triplestore)
    assert len(remaining) == 0
    assert stored_records_match_tracked_records(triplestore)
113+
114+
115+
def test_gc_retain_used(working_data_saved, triplestore):
    """A record with an upvote count of 1 is kept; the other is collected."""
    subject_nodes, _, _ = working_data_saved
    kept = subject_nodes[0]
    # Mark the chosen record as in use by giving it a nonzero upvote count in
    # the GC graph.
    gc_graph = Graph(identifier=RECORDS_GC_GRAPH_IDENTIFIER)
    triplestore.addN([(kept, SCHEMA.upvoteCount, Literal(1), gc_graph)])
    one_week_ahead = dt.date.today() + dt.timedelta(weeks=1)
    collect_garbage(one_week_ahead)
    survivors = stored_records(triplestore)
    assert len(survivors) == 1
    assert survivors[0] == kept
    assert stored_records_match_tracked_records(triplestore)
130+
131+
132+
def test_gc_remove_obsolete(working_data_saved, triplestore):
    """A record with an explicit upvote count of 0 is collected as well."""
    subject_nodes, _, _ = working_data_saved
    unused = subject_nodes[0]
    # An explicit zero count in the GC graph must be treated like a missing
    # count.
    gc_graph = Graph(identifier=RECORDS_GC_GRAPH_IDENTIFIER)
    triplestore.addN([(unused, SCHEMA.upvoteCount, Literal(0), gc_graph)])
    one_week_ahead = dt.date.today() + dt.timedelta(weeks=1)
    collect_garbage(one_week_ahead)
    survivors = stored_records(triplestore)
    assert len(survivors) == 0
    assert stored_records_match_tracked_records(triplestore)

0 commit comments

Comments
 (0)