Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion minecode_pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
#


VERSION = "0.0.1b23"
VERSION = "0.0.1b25"
184 changes: 68 additions & 116 deletions minecode_pipelines/miners/cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,132 +6,84 @@
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from datetime import datetime

from minecode_pipelines.pipes import get_checkpoint_from_file
from minecode_pipelines.pipes import get_commit_at_distance_ahead
from minecode_pipelines.pipes import update_checkpoints_in_github
from minecode_pipelines.pipes import get_changed_files
from minecode_pipelines.pipes.cargo import store_cargo_packages
from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
from minecode_pipelines import VERSION

from scanpipe.pipes.federatedcode import commit_and_push_changes
import json
from pathlib import Path
from django.conf import settings
from scancodeio import VERSION
from aboutcode.pipeline import LoopProgress


PACKAGE_BATCH_SIZE = 500
COMMIT_BATCH_SIZE = 10
def cargo_commit_message(commit_batch, total_commit_batch="many"):
author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
tool_name = "pkg:github/aboutcode-org/scancode.io"

CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"
return f"""\
Collect PackageURLs from crates.io index ({commit_batch}/{total_commit_batch})

Tool: {tool_name}@v{VERSION}
Reference: https://{settings.ALLOWED_HOSTS[0]}

def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logger):
"""
Process Cargo index files commit by commit.
Push changes to fed_repo after:
- every `commit_batch` commits, OR when reaching HEAD.
"""
Signed-off-by: {author_name} <{author_email}>
"""

base_path = Path(cargo_index_repo.working_tree_dir)

while True:
cargo_checkpoints = get_checkpoint_from_file(
cloned_repo=config_repo, path=CARGO_CHECKPOINT_PATH
)

checkpoints_last_commit = cargo_checkpoints.get("last_commit")

try:
next_commit = get_commit_at_distance_ahead(
cargo_index_repo,
checkpoints_last_commit,
num_commits_ahead=COMMIT_BATCH_SIZE,
branch_name="master",
)
except ValueError as e:
logger(str(e))
break

if next_commit == checkpoints_last_commit:
logger("No new commits to mine")
break

changed_files = get_changed_files(
cargo_index_repo, commit_x=checkpoints_last_commit, commit_y=next_commit
)
logger(f"Found {len(changed_files)} changed files in Cargo index.")

file_counter = 0
purl_files = []
purls = []
for idx, rel_path in enumerate(changed_files):
file_path = base_path / rel_path
logger(f"Found {file_path}.")

if not file_path.is_file() or file_path.name in {
"config.json",
"README.md",
"update-dl-url.yml",
}:
continue

packages = []
with open(file_path, encoding="utf-8") as f:
for line in f:
if line.strip():
try:
packages.append(json.loads(line))
except json.JSONDecodeError as e:
logger(f"Skipping invalid JSON in {file_path}: {e}")

file_counter += 1

# Commit and push after each full batch or when processing the last file
commit_and_push = (file_counter % PACKAGE_BATCH_SIZE == 0) or (
idx == len(changed_files) - 1
)

result_store = store_cargo_packages(packages, cloned_data_repo)
if result_store:
purl_file, base_purl = result_store
logger(f"writing packageURLs for package: {base_purl} at: {purl_file}")

purl_files.append(purl_file)
purls.append(str(base_purl))

if not commit_and_push:
continue

commit_changes(
def process_cargo_packages(cargo_index_repo, cloned_data_repo, logger):
"""Mine and publish Cargo PackageURLs from Crates.io package index."""

base_path = Path(cargo_index_repo.working_tree_dir)
batch_size = 4000
file_counter = 0
purl_files = []
commit_count = 1

package_dir = [p for p in base_path.iterdir() if p.is_dir() and not p.name.startswith(".")]
package_paths = [f for dir in package_dir for f in dir.rglob("*") if f.is_file()]
package_count = len(package_paths)

progress = LoopProgress(
total_iterations=package_count,
logger=logger,
)

logger(f"Mine PackageURL for {package_count:,d} Cargo packages.")
for path in progress.iter(package_paths):
packages = []

with open(path, encoding="utf-8") as f:
for line_number, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
packages.append(json.loads(line))
except json.JSONDecodeError as e:
logger(f"Skipping invalid JSON in {path} at line {line_number}: {e}")

file_counter += 1
result = store_cargo_packages(packages, cloned_data_repo)
if result:
purl_file, _ = result
purl_files.append(purl_file)

if file_counter % batch_size == 0 and purl_files:
if commit_and_push_changes(
repo=cloned_data_repo,
files_to_commit=purl_files,
purls=purls,
mine_type="packageURL",
tool_name="pkg:pypi/minecode-pipelines",
tool_version=VERSION,
)

push_changes(repo=cloned_data_repo)
purl_files = []
purls = []

if logger:
logger(
f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {checkpoints_last_commit}"
)

if next_commit != checkpoints_last_commit:
settings_data = {
"date": str(datetime.now()),
"last_commit": next_commit,
}

update_checkpoints_in_github(
checkpoint=settings_data,
cloned_repo=config_repo,
path=CARGO_CHECKPOINT_PATH,
)

logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")
commit_message=cargo_commit_message(commit_count),
logger=logger,
):
commit_count += 1
purl_files.clear()

commit_and_push_changes(
repo=cloned_data_repo,
files_to_commit=purl_files,
commit_message=cargo_commit_message(commit_count, commit_count),
logger=logger,
)
logger(f"Processed PackageURL for {file_counter:,d} Cargo packages.")
logger(f"Pushed new PackageURL in {commit_count:,d} commits.")
20 changes: 4 additions & 16 deletions minecode_pipelines/pipelines/mine_cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from scanpipe.pipes import federatedcode
from minecode_pipelines.miners import cargo
from minecode_pipelines import pipes
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO

MINECODE_DATA_CARGO_REPO = os.environ.get(
"MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
Expand Down Expand Up @@ -59,26 +58,15 @@ def clone_cargo_repos(self):
"""
self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)
self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

if self.log:
self.log(
f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}"
)
self.log(
f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
)
self.log(
f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {self.cloned_config_repo.working_dir}"
)
self.log(f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}")
self.log(f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}")

def mine_and_publish_cargo_packageurls(self):
cargo.process_cargo_packages(
self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo, self.log
)
cargo.process_cargo_packages(self.cargo_index_repo, self.cloned_data_repo, self.log)

def delete_cloned_repos(self):
pipes.delete_cloned_repos(
repos=[self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo],
repos=[self.cargo_index_repo, self.cloned_data_repo],
logger=self.log,
)
2 changes: 1 addition & 1 deletion minecode_pipelines/pipes/cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ def store_cargo_packages(packages, repo):

ppath = hashid.get_package_purls_yml_file_path(base_purl)
purl_file_full_path = Path(repo.working_dir) / ppath
write_data_to_yaml_file(path=purl_file_full_path, data=updated_purls)
write_data_to_yaml_file(path=purl_file_full_path, data=sorted(updated_purls))
return purl_file_full_path, base_purl
4 changes: 2 additions & 2 deletions pyproject-minecode_pipelines.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flot.buildapi"

[project]
name = "minecode_pipelines"
version = "0.0.1b23"
version = "0.0.1b25"
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
readme = "minecode_pipelines/README.rst"
license = { text = "Apache-2.0" }
Expand Down Expand Up @@ -60,7 +60,7 @@ mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift"
mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer"

[tool.bumpversion]
current_version = "0.0.1b23"
current_version = "0.0.1b25"
allow_dirty = true

files = [
Expand Down
Loading