@@ -24,8 +24,14 @@ def repair_metadata(content):
     set_of_update_fields = set()
     total_repaired = 0
     for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
+        # Get the main artifact
+        main_artifact = (
+            package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+            .first()
+            .artifact
+        )
         new_data = artifact_to_python_content_data(
-            package.filename, package._artifacts.get(), package.pulp_domain
+            package.filename, main_artifact, package.pulp_domain
         )
         changed = False
         for field, value in new_data.items():
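
For context: package._artifacts.get() starts raising MultipleObjectsReturned as soon as a package carries both the wheel and its ".metadata" companion artifact, which is why the lookup now excludes ".metadata" paths. A minimal sketch of the same selection with a None guard (get_main_artifact is a hypothetical helper, not part of this change):

def get_main_artifact(package):
    # Pick the content artifact that is not the ".metadata" companion.
    ca = (
        package.contentartifact_set
        .exclude(relative_path__endswith=".metadata")
        .first()
    )
    # Unlike _artifacts.get(), exclude().first() never raises
    # MultipleObjectsReturned, but it can return None, so guard before
    # dereferencing .artifact (the diff above assumes a row always exists).
    return ca.artifact if ca is not None else None
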
215 changes: 215 additions & 0 deletions pulp_python/app/migrations/0019_create_missing_metadata_artifacts.py
@@ -0,0 +1,215 @@
# Generated manually on 2025-12-15 14:00 for creating missing metadata artifacts

from django.db import migrations


def pulp_hashlib_new(name, *args, **kwargs):
    """
    Copied from pulpcore and adapted to work inside migrations.
    """
    import hashlib as the_real_hashlib
    from django.conf import settings

    if name not in settings.ALLOWED_CONTENT_CHECKSUMS:
        return None

    return the_real_hashlib.new(name, *args, **kwargs)


def init_and_validate(file, artifact_model, expected_digests=None, expected_size=None):
    """
    Copied from pulpcore and adapted to work inside migrations.
    """
    from django.conf import settings

    digest_fields = []
    for alg in ("sha512", "sha384", "sha256", "sha224", "sha1", "md5"):
        if alg in settings.ALLOWED_CONTENT_CHECKSUMS:
            digest_fields.append(alg)

    if isinstance(file, str):
        with open(file, "rb") as f:
            hashers = {
                n: hasher for n in digest_fields if (hasher := pulp_hashlib_new(n)) is not None
            }
            if not hashers:
                return None

            size = 0
            while True:
                chunk = f.read(1048576)  # 1 megabyte
                if not chunk:
                    break
                for algorithm in hashers.values():
                    algorithm.update(chunk)
                size = size + len(chunk)
    else:
        size = file.size
        hashers = file.hashers

    if expected_size:
        if size != expected_size:
            return None

    if expected_digests:
        for algorithm, expected_digest in expected_digests.items():
            if algorithm not in hashers:
                return None
            actual_digest = hashers[algorithm].hexdigest()
            if expected_digest != actual_digest:
                return None

    attributes = {"size": size, "file": file}
    for algorithm in digest_fields:
        attributes[algorithm] = hashers[algorithm].hexdigest()

    return artifact_model(**attributes)


def extract_wheel_metadata(filename):
    """
    Extract the metadata file content from a wheel file.
    Returns the raw metadata content as bytes, or None if metadata cannot be extracted.
    """
    import zipfile

    if not filename.endswith(".whl"):
        return None
    try:
        with zipfile.ZipFile(filename, "r") as f:
            for file_path in f.namelist():
                if file_path.endswith(".dist-info/METADATA"):
                    return f.read(file_path)
    except (zipfile.BadZipFile, KeyError, OSError):
        pass
    return None


def artifact_to_metadata_artifact(filename, artifact, tmp_dir, artifact_model):
    """
    Create a metadata artifact from the provided wheel artifact.
    """
    import os
    import shutil
    import tempfile

    if not filename.endswith(".whl"):
        return None

    temp_wheel_path = None
    temp_metadata_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=filename, delete=False
        ) as temp_file:
            temp_wheel_path = temp_file.name
            artifact.file.seek(0)
            shutil.copyfileobj(artifact.file, temp_file)
            temp_file.flush()

        metadata_content = extract_wheel_metadata(temp_wheel_path)
        if not metadata_content:
            return None

        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=".metadata", delete=False
        ) as temp_md:
            temp_metadata_path = temp_md.name
            temp_md.write(metadata_content)
            temp_md.flush()

        # todo: pass metadata_sha256 from PPC to expected_digests in init_and_validate?
        # if not, simplify init_and_validate
        metadata_artifact = init_and_validate(temp_metadata_path, artifact_model)
        if not metadata_artifact:
            return None

        try:
            metadata_artifact.save()
        except Exception:
            return None

        return metadata_artifact

    finally:
        if temp_wheel_path and os.path.exists(temp_wheel_path):
            os.unlink(temp_wheel_path)
        if temp_metadata_path and os.path.exists(temp_metadata_path):
            os.unlink(temp_metadata_path)


# todo: bulk create?
def create_missing_metadata_artifacts(apps, schema_editor):
    """
    Create metadata artifacts for PythonPackageContent instances that have metadata_sha256
    but are missing the corresponding metadata artifact.
    """
    import tempfile
    from django.conf import settings

    PythonPackageContent = apps.get_model("python", "PythonPackageContent")
    ContentArtifact = apps.get_model("core", "ContentArtifact")
    Artifact = apps.get_model("core", "Artifact")

    packages = (
        PythonPackageContent.objects.filter(metadata_sha256__isnull=False)
        .exclude(metadata_sha256="")
        .prefetch_related("contentartifact_set")
    )
    # todo: only for testing, remove later
    created_count = 0
    skipped_count = 0

    # todo: do i need temp dir? (not needed in localhost)
    with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
        for package in packages:
            metadata_relative_path = f"{package.filename}.metadata"
            content_artifacts = list(package.contentartifact_set.all())

            if any(ca.relative_path == metadata_relative_path for ca in content_artifacts):
                # Metadata artifact already exists
                continue

            main_content_artifact = next(
                (ca for ca in content_artifacts if ca.relative_path == package.filename),
                None,
            )
            if not main_content_artifact:
                # Main artifact does not exist
                skipped_count += 1
                continue

            metadata_artifact = artifact_to_metadata_artifact(
                package.filename, main_content_artifact.artifact, temp_dir, Artifact
            )
            if not metadata_artifact:
                # Failed to create metadata artifact
                skipped_count += 1
                continue

            try:
                ContentArtifact.objects.create(
                    artifact=metadata_artifact,
                    content=package,
                    relative_path=metadata_relative_path,
                )
                created_count += 1
            except Exception:
                # Failed to save the content artifact
                skipped_count += 1

    print(f"Created {created_count} missing metadata artifacts. Skipped {skipped_count} packages.")


class Migration(migrations.Migration):

    dependencies = [
        ("python", "0018_packageprovenance"),
    ]

    operations = [
        migrations.RunPython(
            create_missing_metadata_artifacts,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
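
A quick way to sanity-check what this migration produces: the metadata bytes extracted from a wheel should hash to the metadata_sha256 already stored on the package. A standalone sketch using only the standard library (the wheel path is a placeholder):

import hashlib
import zipfile

def wheel_metadata_sha256(wheel_path):
    # Mirrors extract_wheel_metadata above: first *.dist-info/METADATA member wins.
    with zipfile.ZipFile(wheel_path, "r") as zf:
        for name in zf.namelist():
            if name.endswith(".dist-info/METADATA"):
                return hashlib.sha256(zf.read(name)).hexdigest()
    return None

# Expected to equal PythonPackageContent.metadata_sha256 for that package.
print(wheel_metadata_sha256("example-1.0-py3-none-any.whl"))
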
2 changes: 0 additions & 2 deletions pulp_python/app/pypi/views.py
@@ -352,8 +352,6 @@ def parse_package(release_package):
     @extend_schema(operation_id="pypi_simple_package_read", summary="Get package simple page")
     def retrieve(self, request, path, package):
         """Retrieves the simple api html/json page for a package."""
-        media_type = request.accepted_renderer.media_type
-
         repo_ver, content = self.get_rvc()
         # Should I redirect if the normalized name is different?
         normalized = canonicalize_name(package)
45 changes: 45 additions & 0 deletions pulp_python/app/serializers.py
@@ -1,5 +1,6 @@
 import logging
 import os
+import tempfile
 from gettext import gettext as _
 from django.conf import settings
 from django.db.utils import IntegrityError
@@ -22,6 +23,7 @@
 )
 from pulp_python.app.utils import (
     DIST_EXTENSIONS,
+    artifact_to_metadata_artifact,
     artifact_to_python_content_data,
     get_project_metadata_from_file,
     parse_project_metadata,
@@ -93,11 +95,31 @@ class Meta:
         model = python_models.PythonDistribution


+class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
+    """
+    Custom field with an overridden get_attribute method. Meant to be used only in
+    PythonPackageContentSerializer, to handle the possible presence of a metadata artifact.
+    """
+
+    def get_attribute(self, instance):
+        # When content has multiple artifacts (wheel + metadata), return the main one
+        if instance._artifacts.count() > 1:
+            for ca in instance.contentartifact_set.all():
+                if not ca.relative_path.endswith(".metadata"):
+                    return ca.artifact
+
+        return super().get_attribute(instance)
+
+
 class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
     """
     A Serializer for PythonPackageContent.
     """

+    artifact = PythonSingleContentArtifactField(
+        help_text=_("Artifact file representing the physical content"),
+    )
+
     # Core metadata
     # Version 1.0
     author = serializers.CharField(
@@ -386,8 +408,21 @@ def deferred_validate(self, data):
         if attestations := data.pop("attestations", None):
             data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)

+        # Create metadata artifact for wheel files
+        if filename.endswith(".whl"):
+            if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
+                data["metadata_artifact"] = metadata_artifact
+                data["metadata_sha256"] = metadata_artifact.sha256
+
         return data

+    def get_artifacts(self, validated_data):
+        artifacts = super().get_artifacts(validated_data)
+        if metadata_artifact := validated_data.pop("metadata_artifact", None):
+            relative_path = f"{validated_data['filename']}.metadata"
+            artifacts[relative_path] = metadata_artifact
+        return artifacts
+
     def retrieve(self, validated_data):
         content = python_models.PythonPackageContent.objects.filter(
             sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +454,7 @@ def create(self, validated_data):

     class Meta:
         fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
+            "artifact",
             "author",
             "author_email",
             "description",
@@ -514,6 +550,15 @@ def validate(self, data):
             data["provenance"] = self.handle_attestations(
                 filename, data["sha256"], attestations, offline=True
             )
+        # Create metadata artifact for wheel files
+        if filename.endswith(".whl"):
+            with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
+                if metadata_artifact := artifact_to_metadata_artifact(
+                    filename, artifact, tmp_dir=temp_dir
+                ):
+                    data["metadata_artifact"] = metadata_artifact
+                    data["metadata_sha256"] = metadata_artifact.sha256
+
         return data
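
The get_artifacts override above means a wheel upload can now map two relative paths to artifacts instead of one, with the companion stored at "<filename>.metadata" — the same naming convention PEP 658 uses for core metadata files on the simple API. A rough sketch of the resulting mapping (the string values stand in for pulpcore Artifact instances):

filename = "example-1.0-py3-none-any.whl"
artifacts = {
    filename: "wheel artifact",                   # from super().get_artifacts(...)
    f"{filename}.metadata": "metadata artifact",  # added by the override
}
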
14 changes: 11 additions & 3 deletions pulp_python/app/tasks/repair.py
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
-            new_data = artifact_to_python_content_data(
-                package.filename, package._artifacts.get(), domain
+            # Get the main artifact
+            main_artifact = (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .artifact
             )
+            new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
             total_repaired += update_package_if_needed(
                 package, new_data, batch, set_of_update_fields
             )
@@ -113,7 +117,11 @@
         grouped_by_url = defaultdict(list)

         for package in group_set:
-            for ra in package.contentartifact_set.get().remoteartifact_set.all():
+            for ra in (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .remoteartifact_set.all()
+            ):
                 grouped_by_url[ra.remote.url].append((package, ra))

         # Prioritize the URL that can serve the most packages
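
One reading of the "prioritize" comment above, sketched with plain data structures: after grouping (package, remote_artifact) pairs by remote URL, process URLs in descending order of how many packages each can serve. This is an illustrative sketch with made-up URLs and names, not the exact task code:

from collections import defaultdict

grouped_by_url = defaultdict(list)
grouped_by_url["https://pypi.org/simple/"] = [("pkg-a", "ra1"), ("pkg-b", "ra2")]
grouped_by_url["https://mirror.example/simple/"] = [("pkg-a", "ra3")]

# Serve-most-first ordering over the grouped pairs.
for url, pairs in sorted(grouped_by_url.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(url, "can serve", len(pairs), "packages")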