Skip to content

Commit 1591723

Browse files
authored
remove unpublished canvas content (#2386)
* removing newly unpublished content files
* adding tests
1 parent 66a3ebb commit 1591723

File tree

2 files changed

+107
-2
lines changed

2 files changed

+107
-2
lines changed

learning_resources/etl/canvas.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,20 @@
1111

1212
from learning_resources.constants import LearningResourceType, PlatformType
1313
from learning_resources.etl.constants import ETLSource
14-
from learning_resources.etl.utils import _process_olx_path, calc_checksum
14+
from learning_resources.etl.utils import (
15+
_process_olx_path,
16+
calc_checksum,
17+
get_edx_module_id,
18+
)
1519
from learning_resources.models import (
1620
LearningResource,
1721
LearningResourcePlatform,
1822
LearningResourceRun,
1923
)
24+
from learning_resources.utils import bulk_resources_unpublished_actions
25+
from learning_resources_search.constants import (
26+
CONTENT_FILE_TYPE,
27+
)
2028

2129
log = logging.getLogger(__name__)
2230

@@ -119,18 +127,28 @@ def transform_canvas_content_files(
119127
published_items = [
120128
Path(item["path"]).resolve() for item in module_metadata["active"]
121129
]
130+
published_keys = []
122131
with (
123132
TemporaryDirectory(prefix=basedir) as olx_path,
124133
zipfile.ZipFile(course_zipfile.absolute(), "r") as course_archive,
125134
):
126135
for member in course_archive.infolist():
127136
if Path(member.filename).resolve() in published_items:
137+
full_path = Path(olx_path) / Path(member.filename)
138+
published_keys.append(get_edx_module_id(str(full_path), run))
128139
course_archive.extract(member, path=olx_path)
129140
log.debug("processing active file %s", member.filename)
130141
else:
131142
log.debug("skipping unpublished file %s", member.filename)
132143
yield from _process_olx_path(olx_path, run, overwrite=overwrite)
133144

145+
unpublished_content = run.content_files.exclude(key__in=published_keys)
146+
147+
bulk_resources_unpublished_actions(
148+
list(unpublished_content.values_list("id", flat=True)), CONTENT_FILE_TYPE
149+
)
150+
unpublished_content.delete()
151+
134152

135153
def transform_canvas_problem_files(
136154
course_zipfile: Path, run: LearningResourceRun, *, overwrite
@@ -182,7 +200,6 @@ def parse_module_meta(course_archive_path: str) -> dict:
182200
root = ElementTree.fromstring(module_xml)
183201
for module in root.findall(".//ns:module", namespaces):
184202
module_title = module.find("ns:title", namespaces).text
185-
186203
for item in module.findall("ns:items/ns:item", namespaces):
187204
item_state = item.find("ns:workflow_state", namespaces).text
188205
item_title = item.find("ns:title", namespaces).text

learning_resources/etl/canvas_test.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Tests for Canvas ETL functionality"""
22

33
import zipfile
4+
from pathlib import Path
45

56
import pytest
67

@@ -9,13 +10,18 @@
910
parse_canvas_settings,
1011
parse_module_meta,
1112
run_for_canvas_archive,
13+
transform_canvas_content_files,
1214
)
1315
from learning_resources.etl.constants import ETLSource
16+
from learning_resources.etl.utils import get_edx_module_id
1417
from learning_resources.factories import (
18+
ContentFileFactory,
1519
LearningResourceFactory,
1620
LearningResourcePlatformFactory,
21+
LearningResourceRunFactory,
1722
)
1823
from learning_resources.models import LearningResource
24+
from learning_resources_search.constants import CONTENT_FILE_TYPE
1925

2026
pytestmark = pytest.mark.django_db
2127

@@ -250,3 +256,85 @@ def test_parse_module_meta_handles_missing_identifierref(tmp_path):
250256
assert "active" in result
251257
assert len(result["active"]) == 0
252258
assert len(result["unpublished"]) == 0
259+
260+
261+
def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_path):
    """
    Test that transform_canvas_content_files removes content files not marked as published.

    Builds a minimal Canvas export archive containing one active and one
    unpublished module item, fully consumes the transform generator, and
    verifies that the pre-existing content file keyed to the unpublished item
    is passed to bulk_resources_unpublished_actions and deleted from the run.
    """
    # Setup: create a fake run with a pre-existing content file whose key is
    # derived from the path of the item that the archive marks as unpublished
    resource = LearningResourceFactory.create(etl_source=ETLSource.canvas.name)
    run = LearningResourceRunFactory.create(learning_resource=resource)

    published_path = "/test/published/file1.html"
    unpublished_path = "/test/unpublished/file2.html"
    unpublished_cf = ContentFileFactory.create(
        run=run, published=True, key=get_edx_module_id(unpublished_path, run)
    )

    # Module metadata: RES1 is active, RES2 is unpublished
    module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
    <modules xmlns="http://canvas.instructure.com/xsd/cccv1p0">
      <module>
        <title>Module 1</title>
        <items>
          <item>
            <workflow_state>active</workflow_state>
            <title>Item 1</title>
            <identifierref>RES1</identifierref>
            <content_type>resource</content_type>
          </item>
          <item>
            <workflow_state>unpublished</workflow_state>
            <title>Item 2</title>
            <identifierref>RES2</identifierref>
            <content_type>resource</content_type>
          </item>
        </items>
      </module>
    </modules>
    """

    # Manifest: maps the resource identifiers above to file paths in the archive
    manifest_xml = bytes(
        f"""<?xml version="1.0" encoding="UTF-8"?>
    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
      <resources>
        <resource identifier="RES1" type="webcontent">
          <file href="{published_path}"/>
        </resource>
        <resource identifier="RES2" type="webcontent">
          <file href="{unpublished_path}"/>
        </resource>
      </resources>
      <organizations>
        <organization>
          <item identifierref="RES1">
            <title>Item 1</title>
          </item>
          <item identifierref="RES2">
            <title>Item 2</title>
          </item>
        </organization>
      </organizations>
    </manifest>
    """,
        "utf-8",
    )

    # Create a fake course zipfile holding the metadata and both content files
    zip_path = tmp_path / "canvas_course.zip"
    with zipfile.ZipFile(zip_path, "w") as zf:
        zf.writestr("course_settings/module_meta.xml", module_xml)
        zf.writestr("imsmanifest.xml", manifest_xml)
        zf.writestr(published_path, "content")
        zf.writestr(unpublished_path, "content")

    mocker.patch(
        "learning_resources.etl.utils.extract_text_metadata",
        return_value={"content": "test"},
    )
    bulk_unpub = mocker.patch(
        "learning_resources.etl.canvas.bulk_resources_unpublished_actions"
    )

    # The cleanup in transform_canvas_content_files runs after its `yield from`,
    # so the generator has to be exhausted for the removal to happen
    list(transform_canvas_content_files(Path(zip_path), run, overwrite=True))

    # Ensure the unpublished actions were triggered for the stale content file
    bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE)
    # ...and that the content file itself was actually deleted from the run
    assert not run.content_files.filter(id=unpublished_cf.id).exists()

0 commit comments

Comments
 (0)