@@ -24,8 +24,14 @@ def repair_metadata(content):
     set_of_update_fields = set()
     total_repaired = 0
     for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
+        # Get the main artifact
+        main_artifact = (
+            package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+            .first()
+            .artifact
+        )
         new_data = artifact_to_python_content_data(
-            package.filename, package._artifacts.get(), package.pulp_domain
+            package.filename, main_artifact, package.pulp_domain
         )
         changed = False
         for field, value in new_data.items():
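
For context: package._artifacts.get() starts raising MultipleObjectsReturned as soon as a package carries both the wheel and its ".metadata" companion artifact, which is why the lookup now excludes ".metadata" paths. A minimal sketch of the same selection with a None guard (get_main_artifact is a hypothetical helper, not part of this change):

def get_main_artifact(package):
    # Pick the content artifact that is not the ".metadata" companion.
    ca = (
        package.contentartifact_set
        .exclude(relative_path__endswith=".metadata")
        .first()
    )
    # Unlike _artifacts.get(), exclude().first() never raises
    # MultipleObjectsReturned, but it can return None, so guard before
    # dereferencing .artifact (the diff above assumes a row always exists).
    return ca.artifact if ca is not None else None
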
215 changes: 215 additions & 0 deletions pulp_python/app/migrations/0019_create_missing_metadata_artifacts.py
@@ -0,0 +1,215 @@
# Generated manually on 2025-12-15 14:00 for creating missing metadata artifacts

from django.db import migrations


def pulp_hashlib_new(name, *args, **kwargs):
    """
    Copied from pulpcore and adapted to work inside migrations.
    """
    import hashlib as the_real_hashlib
    from django.conf import settings

    if name not in settings.ALLOWED_CONTENT_CHECKSUMS:
        return None

    return the_real_hashlib.new(name, *args, **kwargs)


def init_and_validate(file, artifact_model, expected_digests=None, expected_size=None):
    """
    Copied from pulpcore and adapted to work inside migrations.
    """
    from django.conf import settings

    digest_fields = []
    for alg in ("sha512", "sha384", "sha256", "sha224", "sha1", "md5"):
        if alg in settings.ALLOWED_CONTENT_CHECKSUMS:
            digest_fields.append(alg)

    if isinstance(file, str):
        with open(file, "rb") as f:
            hashers = {
                n: hasher for n in digest_fields if (hasher := pulp_hashlib_new(n)) is not None
            }
            if not hashers:
                return None

            size = 0
            while True:
                chunk = f.read(1048576)  # 1 megabyte
                if not chunk:
                    break
                for algorithm in hashers.values():
                    algorithm.update(chunk)
                size = size + len(chunk)
    else:
        size = file.size
        hashers = file.hashers

    if expected_size:
        if size != expected_size:
            return None

    if expected_digests:
        for algorithm, expected_digest in expected_digests.items():
            if algorithm not in hashers:
                return None
            actual_digest = hashers[algorithm].hexdigest()
            if expected_digest != actual_digest:
                return None

    attributes = {"size": size, "file": file}
    for algorithm in digest_fields:
        attributes[algorithm] = hashers[algorithm].hexdigest()

    return artifact_model(**attributes)


def extract_wheel_metadata(filename):
    """
    Extract the metadata file content from a wheel file.
    Returns the raw metadata content as bytes, or None if metadata cannot be extracted.
    """
    import zipfile

    if not filename.endswith(".whl"):
        return None
    try:
        with zipfile.ZipFile(filename, "r") as f:
            for file_path in f.namelist():
                if file_path.endswith(".dist-info/METADATA"):
                    return f.read(file_path)
    except (zipfile.BadZipFile, KeyError, OSError):
        pass
    return None


def artifact_to_metadata_artifact(filename, artifact, tmp_dir, artifact_model):
    """
    Create a metadata artifact from the provided wheel artifact.
    """
    import os
    import shutil
    import tempfile

    if not filename.endswith(".whl"):
        return None

    temp_wheel_path = None
    temp_metadata_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=filename, delete=False
        ) as temp_file:
            temp_wheel_path = temp_file.name
            artifact.file.seek(0)
            shutil.copyfileobj(artifact.file, temp_file)
            temp_file.flush()

        metadata_content = extract_wheel_metadata(temp_wheel_path)
        if not metadata_content:
            return None

        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=".metadata", delete=False
        ) as temp_md:
            temp_metadata_path = temp_md.name
            temp_md.write(metadata_content)
            temp_md.flush()

        # todo: pass metadata_sha256 from PPC to expected_digests in init_and_validate?
        # if not, simplify init_and_validate
        metadata_artifact = init_and_validate(temp_metadata_path, artifact_model)
        if not metadata_artifact:
            return None

        try:
            metadata_artifact.save()
        except Exception:
            return None

        return metadata_artifact

    finally:
        if temp_wheel_path and os.path.exists(temp_wheel_path):
            os.unlink(temp_wheel_path)
        if temp_metadata_path and os.path.exists(temp_metadata_path):
            os.unlink(temp_metadata_path)


# todo: bulk create?
def create_missing_metadata_artifacts(apps, schema_editor):
    """
    Create metadata artifacts for PythonPackageContent instances that have metadata_sha256
    but are missing the corresponding metadata artifact.
    """
    import tempfile
    from django.conf import settings

    PythonPackageContent = apps.get_model("python", "PythonPackageContent")
    ContentArtifact = apps.get_model("core", "ContentArtifact")
    Artifact = apps.get_model("core", "Artifact")

    packages = (
        PythonPackageContent.objects.filter(metadata_sha256__isnull=False)
        .exclude(metadata_sha256="")
        .prefetch_related("contentartifact_set")
    )
    # todo: only for testing, remove later
    created_count = 0
    skipped_count = 0

    # todo: do i need temp dir? (not needed in localhost)
    with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
        for package in packages:
            metadata_relative_path = f"{package.filename}.metadata"
            content_artifacts = list(package.contentartifact_set.all())

            if any(ca.relative_path == metadata_relative_path for ca in content_artifacts):
                # Metadata artifact already exists
                continue

            main_content_artifact = next(
                (ca for ca in content_artifacts if ca.relative_path == package.filename),
                None,
            )
            if not main_content_artifact:
                # Main artifact does not exist
                skipped_count += 1
                continue

            metadata_artifact = artifact_to_metadata_artifact(
                package.filename, main_content_artifact.artifact, temp_dir, Artifact
            )
            if not metadata_artifact:
                # Failed to create metadata artifact
                skipped_count += 1
                continue

            try:
                ContentArtifact.objects.create(
                    artifact=metadata_artifact,
                    content=package,
                    relative_path=metadata_relative_path,
                )
                created_count += 1
            except Exception:
                # Failed to save the content artifact
                skipped_count += 1

    print(f"Created {created_count} missing metadata artifacts. Skipped {skipped_count} packages.")


class Migration(migrations.Migration):

    dependencies = [
        ("python", "0018_packageprovenance"),
    ]

    operations = [
        migrations.RunPython(
            create_missing_metadata_artifacts,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
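
A quick way to sanity-check what this migration produces: the metadata bytes extracted from a wheel should hash to the metadata_sha256 already stored on the package. A standalone sketch using only the standard library (the wheel path is a placeholder):

import hashlib
import zipfile

def wheel_metadata_sha256(wheel_path):
    # Mirrors extract_wheel_metadata above: first *.dist-info/METADATA member wins.
    with zipfile.ZipFile(wheel_path, "r") as zf:
        for name in zf.namelist():
            if name.endswith(".dist-info/METADATA"):
                return hashlib.sha256(zf.read(name)).hexdigest()
    return None

# Expected to equal PythonPackageContent.metadata_sha256 for that package.
print(wheel_metadata_sha256("example-1.0-py3-none-any.whl"))
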
2 changes: 0 additions & 2 deletions pulp_python/app/pypi/views.py
@@ -352,8 +352,6 @@ def parse_package(release_package):
     @extend_schema(operation_id="pypi_simple_package_read", summary="Get package simple page")
     def retrieve(self, request, path, package):
         """Retrieves the simple api html/json page for a package."""
-        media_type = request.accepted_renderer.media_type
-
         repo_ver, content = self.get_rvc()
         # Should I redirect if the normalized name is different?
         normalized = canonicalize_name(package)
45 changes: 45 additions & 0 deletions pulp_python/app/serializers.py
@@ -1,5 +1,6 @@
 import logging
 import os
+import tempfile
 from gettext import gettext as _
 from django.conf import settings
 from django.db.utils import IntegrityError
@@ -22,6 +23,7 @@
 )
 from pulp_python.app.utils import (
     DIST_EXTENSIONS,
+    artifact_to_metadata_artifact,
     artifact_to_python_content_data,
     get_project_metadata_from_file,
     parse_project_metadata,
@@ -93,11 +95,31 @@ class Meta:
         model = python_models.PythonDistribution


+class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
+    """
+    Custom field with an overridden get_attribute method. Meant to be used only in
+    PythonPackageContentSerializer, to handle the possible presence of a metadata artifact.
+    """
+
+    def get_attribute(self, instance):
+        # When content has multiple artifacts (wheel + metadata), return the main one
+        if instance._artifacts.count() > 1:
+            for ca in instance.contentartifact_set.all():
+                if not ca.relative_path.endswith(".metadata"):
+                    return ca.artifact
+
+        return super().get_attribute(instance)
+
+
 class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
     """
     A Serializer for PythonPackageContent.
     """

+    artifact = PythonSingleContentArtifactField(
+        help_text=_("Artifact file representing the physical content"),
+    )
+
     # Core metadata
     # Version 1.0
     author = serializers.CharField(
@@ -386,8 +408,21 @@ def deferred_validate(self, data):
         if attestations := data.pop("attestations", None):
             data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)

+        # Create metadata artifact for wheel files
+        if filename.endswith(".whl"):
+            if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
+                data["metadata_artifact"] = metadata_artifact
+                data["metadata_sha256"] = metadata_artifact.sha256
+
         return data

+    def get_artifacts(self, validated_data):
+        artifacts = super().get_artifacts(validated_data)
+        if metadata_artifact := validated_data.pop("metadata_artifact", None):
+            relative_path = f"{validated_data['filename']}.metadata"
+            artifacts[relative_path] = metadata_artifact
+        return artifacts
+
     def retrieve(self, validated_data):
         content = python_models.PythonPackageContent.objects.filter(
             sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +454,7 @@ def create(self, validated_data):

     class Meta:
         fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
+            "artifact",
             "author",
             "author_email",
             "description",
@@ -514,6 +550,15 @@ def validate(self, data):
             data["provenance"] = self.handle_attestations(
                 filename, data["sha256"], attestations, offline=True
             )
+        # Create metadata artifact for wheel files
+        if filename.endswith(".whl"):
+            with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
+                if metadata_artifact := artifact_to_metadata_artifact(
+                    filename, artifact, tmp_dir=temp_dir
+                ):
+                    data["metadata_artifact"] = metadata_artifact
+                    data["metadata_sha256"] = metadata_artifact.sha256
+
         return data
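
The get_artifacts override above means a wheel upload can now map two relative paths to artifacts instead of one, with the companion stored at "<filename>.metadata" — the same naming convention PEP 658 uses for core metadata files on the simple API. A rough sketch of the resulting mapping (the string values stand in for pulpcore Artifact instances):

filename = "example-1.0-py3-none-any.whl"
artifacts = {
    filename: "wheel artifact",                   # from super().get_artifacts(...)
    f"{filename}.metadata": "metadata artifact",  # added by the override
}
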
14 changes: 11 additions & 3 deletions pulp_python/app/tasks/repair.py
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
-            new_data = artifact_to_python_content_data(
-                package.filename, package._artifacts.get(), domain
+            # Get the main artifact
+            main_artifact = (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .artifact
             )
+            new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
             total_repaired += update_package_if_needed(
                 package, new_data, batch, set_of_update_fields
             )
@@ -113,7 +117,11 @@
         grouped_by_url = defaultdict(list)

         for package in group_set:
-            for ra in package.contentartifact_set.get().remoteartifact_set.all():
+            for ra in (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .remoteartifact_set.all()
+            ):
                 grouped_by_url[ra.remote.url].append((package, ra))

         # Prioritize the URL that can serve the most packages
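
One reading of the "prioritize" comment above, sketched with plain data structures: after grouping (package, remote_artifact) pairs by remote URL, process URLs in descending order of how many packages each can serve. This is an illustrative sketch with made-up URLs and names, not the exact task code:

from collections import defaultdict

grouped_by_url = defaultdict(list)
grouped_by_url["https://pypi.org/simple/"] = [("pkg-a", "ra1"), ("pkg-b", "ra2")]
grouped_by_url["https://mirror.example/simple/"] = [("pkg-a", "ra3")]

# Serve-most-first ordering over the grouped pairs.
for url, pairs in sorted(grouped_by_url.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(url, "can serve", len(pairs), "packages")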