add tutorbot problems model and etl (#2373)

abeglova · web-flow · commit 3d3e75c46fa1 · 2025-07-21T16:45:13.000-04:00
diff --git a/app.json b/app.json
@@ -704,6 +704,10 @@
     "POSTHOG_TIMEOUT_MS": {
       "description": "Timeout for communication with PostHog API",
       "required": false
+    },
+    "CANVAS_TUTORBOT_FOLDER": {
+      "description": "Folder in Canvas course zip files where tutorbot problem and solution files are stored",
+      "required": false
     }
   },
   "keywords": ["Django", "Python", "MIT", "Office of Digital Learning"],
diff --git a/learning_resources/constants.py b/learning_resources/constants.py
@@ -180,6 +180,11 @@ class LearningResourceRelationTypes(TextChoices):
     zip(VALID_COURSE_CONTENT_TYPES, VALID_COURSE_CONTENT_TYPES)
 )
 
+# Allowed TutorProblemFile.type values and the matching (value, label) pairs
+# used as the model field's `choices`
+VALID_TUTOR_PROBLEM_TYPES = ["problem", "solution"]
+VALID_TUTOR_PROBLEM_TYPE_CHOICES = [(t, t) for t in VALID_TUTOR_PROBLEM_TYPES]
+
 DEPARTMENTS = {
     "1": "Civil and Environmental Engineering",
     "2": "Mechanical Engineering",
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
@@ -25,7 +25,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
     """
     Sync a Canvas course archive from S3
     """
-    from learning_resources.etl.loaders import load_content_files
+    from learning_resources.etl.loaders import load_content_files, load_problem_files
 
     course_folder = key.lstrip(settings.CANVAS_COURSE_BUCKET_PREFIX).split("/")[0]
 
@@ -43,8 +43,16 @@ def sync_canvas_archive(bucket, key: str, overwrite):
                     course_archive_path, run, overwrite=overwrite
                 ),
             )
+
+            load_problem_files(
+                run,
+                transform_canvas_problem_files(
+                    course_archive_path, run, overwrite=overwrite
+                ),
+            )
             run.checksum = checksum
             run.save()
+
     return resource_readable_id, run
 
 
@@ -124,6 +132,42 @@ def transform_canvas_content_files(
         yield from _process_olx_path(olx_path, run, overwrite=overwrite)
 
 
+def transform_canvas_problem_files(
+    course_zipfile: Path, run: LearningResourceRun, *, overwrite
+) -> Generator[dict, None, None]:
+    """
+    Transform tutorbot problem/solution files from a Canvas course zipfile
+
+    Yields a dict for each entry _process_olx_path returns for the tutorbot folder
+    """
+    tutor_folder = settings.CANVAS_TUTORBOT_FOLDER
+    keys_to_keep = (
+        "run",
+        "content",
+        "archive_checksum",
+        "source_path",
+        "file_extension",
+    )
+    basedir = course_zipfile.name.split(".")[0]
+    with (
+        TemporaryDirectory(prefix=basedir) as olx_path,
+        zipfile.ZipFile(course_zipfile.absolute(), "r") as course_archive,
+    ):
+        for member in course_archive.infolist():
+            if member.filename.startswith(tutor_folder):
+                course_archive.extract(member, path=olx_path)
+                log.debug("processing active problem set file %s", member.filename)
+        for file_data in _process_olx_path(olx_path, run, overwrite=overwrite):
+            problem_file_data = {k: file_data[k] for k in keys_to_keep if k in file_data}
+            # source_path minus the folder prefix is read as <title>/<type>/...
+            path_parts = file_data["source_path"][len(tutor_folder) :].split("/")
+            problem_file_data["problem_title"] = path_parts[0]
+            # Guard the index: a path with no "/" left would raise IndexError here
+            if len(path_parts) > 1 and path_parts[1] in ("problem", "solution"):
+                problem_file_data["type"] = path_parts[1]
+            yield problem_file_data
+
+
 def parse_module_meta(course_archive_path: str) -> dict:
     """
     Parse module_meta.xml and return publish/active status of resources.
diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py
@@ -41,6 +41,7 @@
     PodcastEpisode,
     Program,
     RunInstructorRelationship,
+    TutorProblemFile,
     Video,
     VideoChannel,
     VideoPlaylist,
@@ -807,6 +808,63 @@ def load_content_files(
     return None
 
 
+def load_problem_file(
+    course_run: LearningResourceRun, problem_file_data: dict
+) -> int | None:
+    """
+    Sync a tutorbot problem to the database
+
+    Args:
+        course_run (LearningResourceRun): a LearningResourceRun for a Course
+        problem_file_data (dict): field values for the TutorProblemFile
+
+    Returns:
+        int | None: id of the saved object, or None if saving failed (error is logged)
+    """
+    try:
+        problem_file, _ = TutorProblemFile.objects.update_or_create(
+            run=course_run,
+            source_path=problem_file_data.get("source_path"),
+            defaults=problem_file_data,
+        )
+        return problem_file.id  # noqa: TRY300
+    except Exception:  # noqa: BLE001
+        log.exception(
+            "ERROR syncing problem file  %s for run %d",
+            problem_file_data.get("source_path", ""),
+            course_run.id,
+        )
+
+
+def load_problem_files(
+    course_run: LearningResourceRun,
+    problem_files_data: list[dict],
+) -> list[int]:
+    """
+    Sync all problem files for canvas course
+
+    Args:
+        course_run (LearningResourceRun): a course run
+        problem_files_data (list or generator): Details about the problem files
+
+    Returns:
+        list of int: Ids of the TutorProblemFile objects that were created/updated
+
+    """
+    loaded_ids = (
+        load_problem_file(course_run, problem_file)
+        for problem_file in problem_files_data
+    )
+    # load_problem_file returns None on failure; keep only real ids
+    problem_files_ids = [file_id for file_id in loaded_ids if file_id is not None]
+    # Remove this run's problem files that were not part of this load
+    TutorProblemFile.objects.filter(run=course_run).exclude(
+        id__in=problem_files_ids
+    ).delete()
+
+    return problem_files_ids
+
+
 def load_podcast_episode(episode_data: dict) -> LearningResource:
     """
     Load a podcast_episode into the database
diff --git a/learning_resources/etl/loaders_test.py b/learning_resources/etl/loaders_test.py
@@ -43,6 +43,8 @@
     load_podcast,
     load_podcast_episode,
     load_podcasts,
+    load_problem_file,
+    load_problem_files,
     load_program,
     load_programs,
     load_run,
@@ -81,6 +83,7 @@
     LearningResourceRun,
     PodcastEpisode,
     Program,
+    TutorProblemFile,
     Video,
     VideoChannel,
     VideoPlaylist,
@@ -1094,6 +1097,67 @@ def test_load_content_file():
         )
 
 
+def test_load_problem_file():
+    """load_problem_file should persist a TutorProblemFile and return its id"""
+    run = LearningResourceRunFactory.create()
+
+    expected = {
+        "problem_title": "Problem 1",
+        "type": "problem",
+        "source_path": "ai/tutor/problems/Problem 1/problem/problem1",
+        "content": "This is the content of the problem file.",
+    }
+
+    file_id = load_problem_file(run, expected)
+
+    # the loader hands back the primary key of the saved row
+    assert isinstance(file_id, int)
+    assert TutorProblemFile.objects.count() == 1
+
+    saved = TutorProblemFile.objects.get(pk=file_id)
+    assert saved.run == run
+
+    # every supplied property must round-trip onto the model
+    for key, value in expected.items():
+        assert getattr(saved, key) == value, (
+            f"Property {key} should equal {value}"
+        )
+
+
+def test_load_problem_files():
+    """Test that load_problem_files saves new files and deletes stale ones"""
+    course = LearningResourceFactory.create(is_course=True, create_runs=False)
+    course_run = LearningResourceRunFactory.create(learning_resource=course)
+    LearningResourceRunFactory.create(
+        learning_resource=course,
+        start_date=now_in_utc() - timedelta(days=365),
+    )
+    assert course.runs.count() == 2
+
+    # A real TutorProblemFile row, so the stale-file deletion is actually exercised
+    deleted_problem_file = TutorProblemFile.objects.create(
+        run=course_run, type="problem", source_path="ai/tutor/problems/Old/problem/old"
+    )
+    content_data = [
+        {
+            "problem_title": "Problem 1",
+            "type": "problem",
+            "source_path": "ai/tutor/problems/Problem 1/problem/problem1",
+        },
+        {
+            "problem_title": "Problem 1",
+            "type": "solution",
+            "source_path": "ai/tutor/problems/Problem 1/solution/sol1",
+        },
+    ]
+
+    load_problem_files(course_run, content_data)
+
+    assert not TutorProblemFile.objects.filter(id=deleted_problem_file.id).exists()
+    saved_paths = set(course_run.problem_files.values_list("source_path", flat=True))
+    assert saved_paths == {file["source_path"] for file in content_data}
+
+
 def test_load_image():
     """Test that image resources are uniquely created or retrieved based on parameters"""
     resource_url = "https://mit.edu"
diff --git a/learning_resources/factories.py b/learning_resources/factories.py
@@ -878,6 +878,25 @@ class Meta:
         skip_postgeneration_save = True
 
 
+class TutorProblemFileFactory(DjangoModelFactory):
+    """Factory for TutorProblemFiles (pass `run`; the model's FK is non-nullable)"""
+
+    run = None
+    problem_title = factory.Faker("sentence")
+    # FuzzyChoice takes ONE iterable of choices; a second positional arg is `getter`
+    type = FuzzyChoice(("problem", "solution"))
+    content = factory.Faker("text")
+    source_path = factory.Faker("file_path", extension="txt")
+
+    @classmethod
+    def _create(cls, model_class, *args, **kwargs):
+        kwargs.setdefault("run", None)  # make sure "run" is always passed through
+        return super()._create(model_class, *args, **kwargs)
+
+    class Meta:
+        model = models.TutorProblemFile
+
+
 class VideoPlaylistFactory(DjangoModelFactory):
     """Factory for Video Playlists"""
 
diff --git a/learning_resources/migrations/0092_tutorproblemfile.py b/learning_resources/migrations/0092_tutorproblemfile.py
@@ -0,0 +1,64 @@
+# Generated by Django 4.2.23 on 2025-07-18 23:26
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # adds the TutorProblemFile model
+    dependencies = [  # must run after this app's migration 0091
+        ("learning_resources", "0091_content_file_content_view_group"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="TutorProblemFile",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("created_on", models.DateTimeField(auto_now_add=True, db_index=True)),
+                ("updated_on", models.DateTimeField(auto_now=True)),
+                (
+                    "problem_title",
+                    models.CharField(blank=True, max_length=1024, null=True),
+                ),
+                (
+                    "type",  # declared choices: "problem", "solution"
+                    models.CharField(
+                        choices=[("problem", "problem"), ("solution", "solution")],
+                        max_length=128,
+                    ),
+                ),
+                ("content", models.TextField(blank=True, null=True)),
+                (
+                    "archive_checksum",
+                    models.CharField(blank=True, max_length=32, null=True),
+                ),
+                (
+                    "source_path",
+                    models.CharField(blank=True, max_length=1024, null=True),
+                ),
+                (
+                    "file_extension",
+                    models.CharField(blank=True, max_length=32, null=True),
+                ),
+                (
+                    "run",  # rows are deleted together with their run (CASCADE)
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="problem_files",
+                        to="learning_resources.learningresourcerun",
+                    ),
+                ),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+    ]
diff --git a/learning_resources/models.py b/learning_resources/models.py
@@ -861,6 +861,32 @@ def for_serialization(self):
         )
 
 
+class TutorProblemFile(TimestampedModel):
+    """
+    A tutorbot problem or solution file belonging to a LearningResourceRun
+    """
+
+    # Deleting the run also deletes its problem files (CASCADE)
+    run = models.ForeignKey(
+        LearningResourceRun,
+        on_delete=models.CASCADE,
+        related_name="problem_files",
+        null=False,
+        blank=False,
+    )
+    problem_title = models.CharField(blank=True, null=True, max_length=1024)  # noqa: DJ001
+    # One of constants.VALID_TUTOR_PROBLEM_TYPE_CHOICES
+    type = models.CharField(
+        choices=constants.VALID_TUTOR_PROBLEM_TYPE_CHOICES, max_length=128
+    )
+    content = models.TextField(blank=True, null=True)  # noqa: DJ001
+
+    # File metadata
+    archive_checksum = models.CharField(blank=True, null=True, max_length=32)  # noqa: DJ001
+    source_path = models.CharField(blank=True, null=True, max_length=1024)  # noqa: DJ001
+    file_extension = models.CharField(blank=True, null=True, max_length=32)  # noqa: DJ001
+
+
 class ContentFile(TimestampedModel):
     """
     ContentFile model for LearningResourceRun files
diff --git a/main/settings.py b/main/settings.py
@@ -850,3 +850,4 @@ def get_all_config_keys():
 OPENTELEMETRY_ENDPOINT = get_string("OPENTELEMETRY_ENDPOINT", None)
 OPENTELEMETRY_TRACES_BATCH_SIZE = get_int("OPENTELEMETRY_TRACES_BATCH_SIZE", 512)
 OPENTELEMETRY_EXPORT_TIMEOUT_MS = get_int("OPENTELEMETRY_EXPORT_TIMEOUT_MS", 5000)
+CANVAS_TUTORBOT_FOLDER = get_string("CANVAS_TUTORBOT_FOLDER", "web_resources/ai/tutor/")