
Commit e30c640

Ingest canvas courses (#2307)
1 parent d705b53

22 files changed, +625 −67 lines

learning_resources/etl/canvas.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import logging
import zipfile
from collections.abc import Generator
from pathlib import Path
from tempfile import TemporaryDirectory

from defusedxml import ElementTree

from learning_resources.constants import LearningResourceType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import _process_olx_path, calc_checksum
from learning_resources.models import LearningResource, LearningResourceRun

log = logging.getLogger(__name__)


def sync_canvas_archive(bucket, key: str, overwrite):
    """
    Sync a Canvas course archive from S3
    """
    from learning_resources.etl.loaders import load_content_files

    with TemporaryDirectory() as export_tempdir:
        course_archive_path = Path(export_tempdir, key.split("/")[-1])
        bucket.download_file(key, course_archive_path)
        run = run_for_canvas_archive(course_archive_path, overwrite=overwrite)
        checksum = calc_checksum(course_archive_path)
        if run:
            load_content_files(
                run,
                transform_canvas_content_files(
                    course_archive_path, run, overwrite=overwrite
                ),
            )
            run.checksum = checksum
            run.save()


def run_for_canvas_archive(course_archive_path, overwrite):
    """
    Generate and return a LearningResourceRun for a Canvas course
    """
    checksum = calc_checksum(course_archive_path)
    course_info = parse_canvas_settings(course_archive_path)
    course_title = course_info.get("title")
    readable_id = course_info.get("course_code")
    # create placeholder learning resource
    resource, _ = LearningResource.objects.get_or_create(
        readable_id=readable_id,
        defaults={
            "title": course_title,
            "published": False,
            "test_mode": True,
            "etl_source": ETLSource.canvas.name,
            "resource_type": LearningResourceType.course.name,
        },
    )
    if resource.runs.count() == 0:
        LearningResourceRun.objects.create(
            run_id=f"{readable_id}+canvas", learning_resource=resource, published=True
        )
    run = resource.runs.first()
    if run.checksum == checksum and not overwrite:
        log.info("Checksums match for %s, skipping load", readable_id)
        return None
    run.checksum = checksum
    run.save()
    return run


def parse_canvas_settings(course_archive_path):
    """
    Get course attributes from a Canvas course archive
    """
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        xml_string = course_archive.read("course_settings/course_settings.xml")
    tree = ElementTree.fromstring(xml_string)
    attributes = {}
    for node in tree.iter():
        tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
        attributes[tag] = node.text
    return attributes


def transform_canvas_content_files(
    course_zipfile: Path, run: LearningResourceRun, *, overwrite
) -> Generator[dict, None, None]:
    """
    Transform content files from a Canvas course zipfile
    """
    basedir = course_zipfile.name.split(".")[0]
    with (
        TemporaryDirectory(prefix=basedir) as olx_path,
        zipfile.ZipFile(course_zipfile.absolute(), "r") as course_archive,
    ):
        for member in course_archive.infolist():
            course_archive.extract(member, path=olx_path)
        yield from _process_olx_path(olx_path, run, overwrite=overwrite)
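To make the settings parsing concrete, here is a minimal, hypothetical sketch (the archive name, namespace URI, and field values are all invented): parse_canvas_settings reads course_settings/course_settings.xml out of the export zip and flattens it into a dict, stripping the XML namespace from each tag.

import zipfile

from learning_resources.etl.canvas import parse_canvas_settings

# Hypothetical export: a real archive holds the whole course export,
# but only course_settings/course_settings.xml matters to this function.
SETTINGS_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<course xmlns="http://canvas.instructure.com/xsd/cccv1p0">
  <title>Example Course</title>
  <course_code>EX.101</course_code>
</course>"""

with zipfile.ZipFile("example-export.zip", "w") as archive:
    archive.writestr("course_settings/course_settings.xml", SETTINGS_XML)

info = parse_canvas_settings("example-export.zip")
assert info["title"] == "Example Course"  # namespace stripped from tags
assert info["course_code"] == "EX.101"    # becomes the readable_id

run_for_canvas_archive then uses course_code as the readable_id of a placeholder resource, and sync_canvas_archive ties the pieces together: download the key from S3, create or refresh the run, and hand the generator from transform_canvas_content_files to load_content_files.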

learning_resources/etl/constants.py

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ class ETLSource(ExtendedEnum):
     see = "see"
     xpro = "xpro"
     youtube = "youtube"
+    canvas = "canvas"


 class CourseNumberType(Enum):
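Since the member is defined as canvas = "canvas", standard Enum semantics make both its .name and .value that same string; canvas.py above stores ETLSource.canvas.name on etl_source, and utils.py below keys the bucket lookup on it:

from learning_resources.etl.constants import ETLSource

assert ETLSource.canvas.name == "canvas"
assert ETLSource.canvas.value == "canvas"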

learning_resources/etl/utils.py

Lines changed: 71 additions & 62 deletions
@@ -9,6 +9,7 @@
 import re
 import tarfile
 import uuid
+import zipfile
 from collections import Counter
 from collections.abc import Generator
 from datetime import UTC, datetime
@@ -406,6 +407,69 @@ def text_from_sjson_content(content: str):
     return " ".join(data.get("text", []))


+def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
+    for document, metadata in documents_from_olx(olx_path):
+        source_path = metadata.get("source_path")
+        edx_module_id = get_edx_module_id(source_path, run)
+        key = edx_module_id
+        content_type = metadata["content_type"]
+        mime_type = metadata.get("mime_type")
+        file_extension = metadata.get("file_extension")
+        existing_content = ContentFile.objects.filter(key=key, run=run).first()
+        if (
+            not existing_content
+            or existing_content.archive_checksum != metadata.get("archive_checksum")
+        ) or overwrite:
+            if settings.SKIP_TIKA and settings.ENVIRONMENT != "production":
+                content_dict = {
+                    "content": "",
+                    "content_title": "",
+                }
+            else:
+                tika_output = extract_text_metadata(
+                    document,
+                    other_headers={"Content-Type": mime_type} if mime_type else {},
+                )
+                if tika_output is None:
+                    log.info("No tika response for %s", key)
+                    continue
+
+                tika_content = tika_output.get("content") or ""
+                tika_metadata = tika_output.get("metadata") or {}
+                content = tika_content.strip()
+                if file_extension == ".srt":
+                    content = text_from_srt_content(content)
+                elif file_extension == ".sjson":
+                    content = text_from_sjson_content(content)
+
+                if not content:
+                    continue
+
+                content_dict = {
+                    "content": content,
+                    "content_title": (
+                        metadata.get("title") or tika_metadata.get("title") or ""
+                    )[: get_max_contentfile_length("content_title")],
+                }
+        else:
+            content_dict = {
+                "content": existing_content.content,
+                "content_title": existing_content.content_title,
+            }
+        yield (
+            {
+                "key": key,
+                "published": True,
+                "content_type": content_type,
+                "archive_checksum": metadata.get("archive_checksum"),
+                "file_extension": file_extension,
+                "source_path": source_path,
+                "edx_module_id": edx_module_id,
+                **content_dict,
+            }
+        )
+
+
 def transform_content_files(
     course_tarpath: Path, run: LearningResourceRun, *, overwrite: bool
 ) -> Generator[dict, None, None]:
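Each record yielded by _process_olx_path (and, after the next hunk, by transform_content_files, which delegates to it) is a plain dict. A hypothetical example of one yielded record, every value invented for illustration:

{
    "key": "asset-v1:MITx+EX.101+canvas+type@asset+block@syllabus.pdf",
    "published": True,
    "content_type": "file",          # taken from the OLX metadata
    "archive_checksum": "d41d8cd98f00b204e9800998ecf8427e",
    "file_extension": ".pdf",
    "source_path": "static/syllabus.pdf",
    "edx_module_id": "asset-v1:MITx+EX.101+canvas+type@asset+block@syllabus.pdf",
    "content": "Course syllabus ...",  # Tika-extracted text
    "content_title": "Syllabus",       # capped at the model's max length
}

When an unchanged ContentFile already exists for the key and overwrite is false, content and content_title are copied from the existing row instead of re-running Tika.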
@@ -423,68 +487,7 @@ def transform_content_files(
     with TemporaryDirectory(prefix=basedir) as inner_tempdir:
         check_call(["tar", "xf", course_tarpath], cwd=inner_tempdir)  # noqa: S603,S607
         olx_path = glob.glob(inner_tempdir + "/*")[0]  # noqa: PTH207
-        for document, metadata in documents_from_olx(olx_path):
-            ...  # 62 deleted lines: the per-document loop body shown above,
-                 # moved verbatim into _process_olx_path
+        yield from _process_olx_path(olx_path, run, overwrite=overwrite)


 def get_learning_course_bucket_name(etl_source: str) -> str:
@@ -502,6 +505,7 @@ def get_learning_course_bucket_name(etl_source: str) -> str:
         ETLSource.xpro.name: settings.XPRO_LEARNING_COURSE_BUCKET_NAME,
         ETLSource.mitxonline.name: settings.MITX_ONLINE_LEARNING_COURSE_BUCKET_NAME,
         ETLSource.oll.name: settings.OLL_LEARNING_COURSE_BUCKET_NAME,
+        ETLSource.canvas.name: settings.CANVAS_COURSE_BUCKET_NAME,
     }
     return bucket_names.get(etl_source)
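CANVAS_COURSE_BUCKET_NAME is presumably a new Django setting introduced elsewhere in this commit (22 files changed overall); assuming that, the lookup resolves as:

from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import get_learning_course_bucket_name

# Returns settings.CANVAS_COURSE_BUCKET_NAME; unknown sources fall
# through to None via dict.get().
bucket_name = get_learning_course_bucket_name(ETLSource.canvas.name)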

@@ -536,6 +540,11 @@ def calc_checksum(filename) -> str:
     Returns:
         str: The md5 checksum of the file
     """
+    if zipfile.is_zipfile(filename):
+        with zipfile.ZipFile(filename, "r") as zip_file:
+            return str(
+                hash(tuple(f"{zp.filename}:{zp.file_size}" for zp in zip_file.filelist))
+            )
     with tarfile.open(filename, "r") as tgz_file:
         return str(hash(tuple(ti.chksum for ti in tgz_file.getmembers())))

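One caveat worth flagging: hash() of a str is randomized per Python process unless PYTHONHASHSEED is pinned, so the zip branch can report a different "checksum" for the same archive in different worker processes, whereas the tar branch hashes ints (ti.chksum), which are stable. The docstring's "md5 checksum" is also inexact for both branches, which return str(hash(...)). If cross-process stability matters, a digest over the same member data is one possible variant; a sketch, not the commit's code:

import hashlib
import zipfile


def stable_zip_checksum(filename) -> str:
    """Process-stable fingerprint over zip member names and sizes."""
    digest = hashlib.md5()  # fingerprint only, not a security control
    with zipfile.ZipFile(filename, "r") as zip_file:
        for member in zip_file.filelist:
            digest.update(f"{member.filename}:{member.file_size}".encode())
    return digest.hexdigest()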