
Commit e30c640

Ingest canvas courses (#2307)
1 parent d705b53

22 files changed, +625 −67 lines

learning_resources/etl/canvas.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import logging
import zipfile
from collections.abc import Generator
from pathlib import Path
from tempfile import TemporaryDirectory

from defusedxml import ElementTree

from learning_resources.constants import LearningResourceType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import _process_olx_path, calc_checksum
from learning_resources.models import LearningResource, LearningResourceRun

log = logging.getLogger(__name__)


def sync_canvas_archive(bucket, key: str, overwrite):
    """
    Sync a Canvas course archive from S3
    """
    from learning_resources.etl.loaders import load_content_files

    with TemporaryDirectory() as export_tempdir:
        course_archive_path = Path(export_tempdir, key.split("/")[-1])
        bucket.download_file(key, course_archive_path)
        run = run_for_canvas_archive(course_archive_path, overwrite=overwrite)
        checksum = calc_checksum(course_archive_path)
        if run:
            load_content_files(
                run,
                transform_canvas_content_files(
                    course_archive_path, run, overwrite=overwrite
                ),
            )
            run.checksum = checksum
            run.save()


def run_for_canvas_archive(course_archive_path, overwrite):
    """
    Generate and return a LearningResourceRun for a Canvas course
    """
    checksum = calc_checksum(course_archive_path)
    course_info = parse_canvas_settings(course_archive_path)
    course_title = course_info.get("title")
    readable_id = course_info.get("course_code")
    # create placeholder learning resource
    resource, _ = LearningResource.objects.get_or_create(
        readable_id=readable_id,
        defaults={
            "title": course_title,
            "published": False,
            "test_mode": True,
            "etl_source": ETLSource.canvas.name,
            "resource_type": LearningResourceType.course.name,
        },
    )
    if resource.runs.count() == 0:
        LearningResourceRun.objects.create(
            run_id=f"{readable_id}+canvas", learning_resource=resource, published=True
        )
    run = resource.runs.first()
    if run.checksum == checksum and not overwrite:
        log.info("Checksums match for %s, skipping load", readable_id)
        return None
    run.checksum = checksum
    run.save()
    return run


def parse_canvas_settings(course_archive_path):
    """
    Get course attributes from a Canvas course archive
    """
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        xml_string = course_archive.read("course_settings/course_settings.xml")
    tree = ElementTree.fromstring(xml_string)
    attributes = {}
    for node in tree.iter():
        tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
        attributes[tag] = node.text
    return attributes


def transform_canvas_content_files(
    course_zipfile: Path, run: LearningResourceRun, *, overwrite
) -> Generator[dict, None, None]:
    """
    Transform content files from a Canvas course zipfile
    """
    basedir = course_zipfile.name.split(".")[0]
    with (
        TemporaryDirectory(prefix=basedir) as olx_path,
        zipfile.ZipFile(course_zipfile.absolute(), "r") as course_archive,
    ):
        for member in course_archive.infolist():
            course_archive.extract(member, path=olx_path)
        yield from _process_olx_path(olx_path, run, overwrite=overwrite)
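To make the settings parsing concrete, here is a minimal, hypothetical sketch (the archive name, namespace URI, and field values are all invented): parse_canvas_settings reads course_settings/course_settings.xml out of the export zip and flattens it into a dict, stripping the XML namespace from each tag.

import zipfile

from learning_resources.etl.canvas import parse_canvas_settings

# Hypothetical export: a real archive holds the whole course export,
# but only course_settings/course_settings.xml matters to this function.
SETTINGS_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<course xmlns="http://canvas.instructure.com/xsd/cccv1p0">
  <title>Example Course</title>
  <course_code>EX.101</course_code>
</course>"""

with zipfile.ZipFile("example-export.zip", "w") as archive:
    archive.writestr("course_settings/course_settings.xml", SETTINGS_XML)

info = parse_canvas_settings("example-export.zip")
assert info["title"] == "Example Course"  # namespace stripped from tags
assert info["course_code"] == "EX.101"    # becomes the readable_id

run_for_canvas_archive then uses course_code as the readable_id of a placeholder resource, and sync_canvas_archive ties the pieces together: download the key from S3, create or refresh the run, and hand the generator from transform_canvas_content_files to load_content_files.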

learning_resources/etl/constants.py

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ class ETLSource(ExtendedEnum):
     see = "see"
     xpro = "xpro"
     youtube = "youtube"
+    canvas = "canvas"


 class CourseNumberType(Enum):
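Since the member is defined as canvas = "canvas", standard Enum semantics make both its .name and .value that same string; canvas.py above stores ETLSource.canvas.name on etl_source, and utils.py below keys the bucket lookup on it:

from learning_resources.etl.constants import ETLSource

assert ETLSource.canvas.name == "canvas"
assert ETLSource.canvas.value == "canvas"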

learning_resources/etl/utils.py

Lines changed: 71 additions & 62 deletions
@@ -9,6 +9,7 @@
 import re
 import tarfile
 import uuid
+import zipfile
 from collections import Counter
 from collections.abc import Generator
 from datetime import UTC, datetime
@@ -406,6 +407,69 @@ def text_from_sjson_content(content: str):
     return " ".join(data.get("text", []))


+def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
+    for document, metadata in documents_from_olx(olx_path):
+        source_path = metadata.get("source_path")
+        edx_module_id = get_edx_module_id(source_path, run)
+        key = edx_module_id
+        content_type = metadata["content_type"]
+        mime_type = metadata.get("mime_type")
+        file_extension = metadata.get("file_extension")
+        existing_content = ContentFile.objects.filter(key=key, run=run).first()
+        if (
+            not existing_content
+            or existing_content.archive_checksum != metadata.get("archive_checksum")
+        ) or overwrite:
+            if settings.SKIP_TIKA and settings.ENVIRONMENT != "production":
+                content_dict = {
+                    "content": "",
+                    "content_title": "",
+                }
+            else:
+                tika_output = extract_text_metadata(
+                    document,
+                    other_headers={"Content-Type": mime_type} if mime_type else {},
+                )
+                if tika_output is None:
+                    log.info("No tika response for %s", key)
+                    continue
+
+                tika_content = tika_output.get("content") or ""
+                tika_metadata = tika_output.get("metadata") or {}
+                content = tika_content.strip()
+                if file_extension == ".srt":
+                    content = text_from_srt_content(content)
+                elif file_extension == ".sjson":
+                    content = text_from_sjson_content(content)
+
+                if not content:
+                    continue
+
+                content_dict = {
+                    "content": content,
+                    "content_title": (
+                        metadata.get("title") or tika_metadata.get("title") or ""
+                    )[: get_max_contentfile_length("content_title")],
+                }
+        else:
+            content_dict = {
+                "content": existing_content.content,
+                "content_title": existing_content.content_title,
+            }
+        yield (
+            {
+                "key": key,
+                "published": True,
+                "content_type": content_type,
+                "archive_checksum": metadata.get("archive_checksum"),
+                "file_extension": file_extension,
+                "source_path": source_path,
+                "edx_module_id": edx_module_id,
+                **content_dict,
+            }
+        )
+
+
 def transform_content_files(
     course_tarpath: Path, run: LearningResourceRun, *, overwrite: bool
 ) -> Generator[dict, None, None]:
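Each record yielded by _process_olx_path (and, after the next hunk, by transform_content_files, which delegates to it) is a plain dict. A hypothetical example of one yielded record, every value invented for illustration:

{
    "key": "asset-v1:MITx+EX.101+canvas+type@asset+block@syllabus.pdf",
    "published": True,
    "content_type": "file",          # taken from the OLX metadata
    "archive_checksum": "d41d8cd98f00b204e9800998ecf8427e",
    "file_extension": ".pdf",
    "source_path": "static/syllabus.pdf",
    "edx_module_id": "asset-v1:MITx+EX.101+canvas+type@asset+block@syllabus.pdf",
    "content": "Course syllabus ...",  # Tika-extracted text
    "content_title": "Syllabus",       # capped at the model's max length
}

When an unchanged ContentFile already exists for the key and overwrite is false, content and content_title are copied from the existing row instead of re-running Tika.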
@@ -423,68 +487,7 @@ def transform_content_files(
     with TemporaryDirectory(prefix=basedir) as inner_tempdir:
         check_call(["tar", "xf", course_tarpath], cwd=inner_tempdir)  # noqa: S603,S607
         olx_path = glob.glob(inner_tempdir + "/*")[0]  # noqa: PTH207
-        for document, metadata in documents_from_olx(olx_path):
-            ...  # 62 deleted lines: the per-document loop body shown above,
-                 # moved verbatim into _process_olx_path
+        yield from _process_olx_path(olx_path, run, overwrite=overwrite)


 def get_learning_course_bucket_name(etl_source: str) -> str:
@@ -502,6 +505,7 @@ def get_learning_course_bucket_name(etl_source: str) -> str:
         ETLSource.xpro.name: settings.XPRO_LEARNING_COURSE_BUCKET_NAME,
         ETLSource.mitxonline.name: settings.MITX_ONLINE_LEARNING_COURSE_BUCKET_NAME,
         ETLSource.oll.name: settings.OLL_LEARNING_COURSE_BUCKET_NAME,
+        ETLSource.canvas.name: settings.CANVAS_COURSE_BUCKET_NAME,
     }
     return bucket_names.get(etl_source)
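CANVAS_COURSE_BUCKET_NAME is presumably a new Django setting introduced elsewhere in this commit (22 files changed overall); assuming that, the lookup resolves as:

from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import get_learning_course_bucket_name

# Returns settings.CANVAS_COURSE_BUCKET_NAME; unknown sources fall
# through to None via dict.get().
bucket_name = get_learning_course_bucket_name(ETLSource.canvas.name)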

@@ -536,6 +540,11 @@ def calc_checksum(filename) -> str:
     Returns:
         str: The md5 checksum of the file
     """
+    if zipfile.is_zipfile(filename):
+        with zipfile.ZipFile(filename, "r") as zip_file:
+            return str(
+                hash(tuple(f"{zp.filename}:{zp.file_size}" for zp in zip_file.filelist))
+            )
     with tarfile.open(filename, "r") as tgz_file:
         return str(hash(tuple(ti.chksum for ti in tgz_file.getmembers())))

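One caveat worth flagging: hash() of a str is randomized per Python process unless PYTHONHASHSEED is pinned, so the zip branch can report a different "checksum" for the same archive in different worker processes, whereas the tar branch hashes ints (ti.chksum), which are stable. The docstring's "md5 checksum" is also inexact for both branches, which return str(hash(...)). If cross-process stability matters, a digest over the same member data is one possible variant; a sketch, not the commit's code:

import hashlib
import zipfile


def stable_zip_checksum(filename) -> str:
    """Process-stable fingerprint over zip member names and sizes."""
    digest = hashlib.md5()  # fingerprint only, not a security control
    with zipfile.ZipFile(filename, "r") as zip_file:
        for member in zip_file.filelist:
            digest.update(f"{member.filename}:{member.file_size}".encode())
    return digest.hexdigest()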