Skip to content

Commit 8783ec5

Browse files
Add support for mining cpan packageURLs
Reference: #685 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent b89d903 commit 8783ec5

File tree

4 files changed

+330
-1
lines changed

4 files changed

+330
-1
lines changed

minecode_pipelines/miners/cpan.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import gzip
11+
import requests
12+
13+
from bs4 import BeautifulSoup
14+
15+
16+
from packageurl import PackageURL
17+
18+
from minecode_pipelines.utils import get_temp_file
19+
from minecode_pipelines.pipes import write_data_to_json_file
20+
21+
"""
22+
Visitors for cpan and cpan-like perl package repositories.
23+
"""
24+
25+
26+
# Root URL of the CPAN site; the index URL and author directory URLs are built from it
CPAN_REPO = "https://www.cpan.org/"
27+
# PackageURL type given to every purl built for a cpan package
CPAN_TYPE = "cpan"
28+
29+
30+
def get_cpan_packages(cpan_repo, logger=None):
    """
    Return a mapping of {distribution name: author path prefix} built from
    the ``modules/02packages.details.txt.gz`` index of the ``cpan_repo`` base
    URL. The path prefix is the directory of the distribution archive,
    relative to ``authors/id/``.

    ``logger`` is an optional callable taking a message string. An empty
    mapping is returned when the index cannot be downloaded.
    """
    base_url = (cpan_repo or CPAN_REPO).rstrip("/")
    cpan_packages_url = f"{base_url}/modules/02packages.details.txt.gz"

    # The timeout keeps the pipeline from hanging forever on a stalled connection
    response = requests.get(cpan_packages_url, timeout=60)
    if not response.ok:
        if logger:
            logger(f"Failed to fetch cpan index: {cpan_packages_url}")
        return {}

    # Decompress in memory: the whole index is read at once anyway and this
    # avoids leaving scratch files behind in the current working directory
    packages_content = gzip.decompress(response.content).decode("utf-8")
    return _parse_cpan_packages_index(packages_content)


def _parse_cpan_packages_index(packages_content):
    """
    Return a mapping of {distribution name: author path prefix} parsed from
    the ``packages_content`` text of a cpan ``02packages.details.txt`` index.
    """
    lines = packages_content.splitlines()

    # The header block ends at the first blank line. If there is none, fall
    # back to the fixed header size (entries starting at the tenth line).
    header_end = next(
        (index for index, line in enumerate(lines) if not line.strip()),
        8,
    )

    package_path_by_name = {}
    for line in lines[header_end + 1:]:
        fields = line.split()
        if not fields:
            continue

        # The last column is the archive path, for instance
        # "A/AU/AUTHOR/Dist-Name-1.0.tar.gz"
        path_prefix, _, filename = fields[-1].rpartition("/")

        # Everything before the last dash is the distribution name
        name, _, _version = filename.replace(".tar.gz", "").rpartition("-")
        if name:
            package_path_by_name[name] = path_prefix

    return package_path_by_name
66+
67+
68+
def write_packages_json(packages, name):
    """
    Write the ``packages`` data as JSON to a temporary file obtained for
    ``name`` and return that temporary file.
    """
    json_location = get_temp_file(name)
    write_data_to_json_file(data=packages, path=json_location)
    return json_location
72+
73+
74+
def get_cpan_packageurls(name, path_prefix):
    """
    Return a sorted list of versioned cpan PackageURL strings for the
    distribution ``name``, one for each unique version found in the file
    listing of the cpan author directory at ``path_prefix`` (a path relative
    to ``authors/id/``).

    An empty list is returned when the page cannot be fetched or when it has
    no file listing.
    """
    # Suffixes of the files that accompany a release in a cpan author directory
    release_file_suffixes = (".meta", ".readme", ".tar.gz", ".tgz", ".tar.bz2", ".zip")
    non_release_entries = {"Parent Directory", "CHECKSUMS"}

    # CPAN_REPO ends with a slash: strip it to avoid a "//" in the URL path
    cpan_author_page_url = f"{CPAN_REPO.rstrip('/')}/authors/id/{path_prefix}"

    # The timeout keeps the pipeline from hanging forever on a stalled connection
    response = requests.get(cpan_author_page_url, timeout=60)
    if not response.ok:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    listing = soup.find("ul")
    if listing is None:
        # Not a directory listing page: there is nothing to mine
        return []

    versions = set()
    for element in listing.text.splitlines():
        entry = element.strip()
        if not entry or entry in non_release_entries:
            continue

        package_file = entry.replace(" ", "")
        # Only strip a known suffix from the end of the file name, never
        # from the middle of a name or version
        for suffix in release_file_suffixes:
            if package_file.endswith(suffix):
                package_file = package_file[: -len(suffix)]
                break

        # Everything before the last dash is the name, the rest is the version
        package_name, _, version = package_file.rpartition("-")
        if package_name != name or not version:
            continue

        versions.add(version)

    # Sorted so that the output is stable from one run to the next
    return [
        PackageURL(type=CPAN_TYPE, name=name, version=version).to_string()
        for version in sorted(versions)
    ]
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import federatedcode
25+
26+
from minecode_pipelines import pipes
27+
from minecode_pipelines.pipes import cpan
28+
29+
30+
class MineCpan(Pipeline):
    """
    Mine all packageURLs from a cpan index and publish them to
    a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.check_federatedcode_eligibility,
            cls.mine_cpan_packages,
            cls.mine_and_publish_cpan_packageurls,
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """Check that FederatedCode is configured and available."""
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def mine_cpan_packages(self):
        """Mine cpan package names from the cpan index."""
        self.cpan_packages = cpan.mine_cpan_packages(logger=self.log)

    def mine_and_publish_cpan_packageurls(self):
        """Get and publish cpan packageURLs for all mined cpan package names."""
        # The keyword must match the parameter name of
        # cpan.mine_and_publish_cpan_packageurls
        self.repos = cpan.mine_and_publish_cpan_packageurls(
            package_path_by_name=self.cpan_packages,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the repositories cloned by the previous step, if any."""
        # No repository is cloned when there was no package to mine
        if not self.repos:
            return
        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)

minecode_pipelines/pipes/cpan.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from minecode_pipelines import VERSION
24+
from minecode_pipelines.pipes import write_packageurls_to_file
25+
26+
from minecode_pipelines.miners.cpan import get_cpan_packages
27+
from minecode_pipelines.miners.cpan import get_cpan_packageurls
28+
from minecode_pipelines.miners.cpan import CPAN_REPO
29+
30+
from minecode_pipelines.miners.cpan import CPAN_TYPE
31+
from minecode_pipelines.utils import grouper
32+
33+
from aboutcode.hashid import get_package_base_dir
34+
from packageurl import PackageURL
35+
from scanpipe.pipes.federatedcode import clone_repository
36+
37+
from scanpipe.pipes.federatedcode import commit_changes
38+
from scanpipe.pipes.federatedcode import push_changes
39+
40+
41+
# If True, show full details on fetching packageURL for
42+
# a package name present in the index
43+
LOG_PACKAGEURL_DETAILS = False
44+
45+
# Number of package names mined per batch; changes are committed and pushed once per batch
PACKAGE_BATCH_SIZE = 500
46+
47+
48+
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
49+
MINECODE_DATA_CPAN_REPO = "https://github.com/aboutcode-data/minecode-data-cpan-test"
50+
51+
52+
def mine_cpan_packages(logger=None):
    """
    Return the mapping of {package name: author path prefix} mined from the
    cpan index at CPAN_REPO, as returned by ``get_cpan_packages``.

    ``logger`` is an optional callable taking a message string.
    """
    if logger:
        logger("Getting packages from cpan index")

    package_path_by_name = get_cpan_packages(cpan_repo=CPAN_REPO, logger=logger)

    if logger:
        # get_cpan_packages may return nothing when the index download fails
        packages_count = len(package_path_by_name or {})
        logger(f"Mined {packages_count} packages from cpan index")

    return package_path_by_name
63+
64+
65+
def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None):
    """
    Mine the packageURLs of every package of the ``package_path_by_name``
    mapping of {package name: cpan author path prefix} and publish them to
    the MINECODE_DATA_CPAN_REPO git repository, committing and pushing once
    per batch of PACKAGE_BATCH_SIZE package names.

    Return the list of cloned repositories for the caller to clean up. This
    list is empty when there is no package to mine.
    ``logger`` is an optional callable taking a message string.
    """
    if not package_path_by_name:
        return []

    # clone repo
    cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_CPAN_REPO)
    if logger:
        logger(f"{MINECODE_DATA_CPAN_REPO} repo cloned at: {cloned_data_repo.working_dir}")

    log_details = bool(logger) and LOG_PACKAGEURL_DETAILS

    for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=package_path_by_name):
        purls = []
        purl_files = []

        if log_details:
            logger("Starting package mining for a batch of packages")

        for package_name in package_batch:
            # presumably grouper pads the last batch with empty values -- TODO confirm
            if not package_name:
                continue

            # fetch packageURLs for package
            if log_details:
                logger(f"getting packageURLs for package: {package_name}")

            path_prefix = package_path_by_name.get(package_name)
            if not path_prefix:
                continue

            packageurls = get_cpan_packageurls(name=package_name, path_prefix=path_prefix)
            if not packageurls:
                if log_details:
                    logger(f"Package versions not present for package: {package_name}")
                continue

            # get repo and path for package
            base_purl = PackageURL(type=CPAN_TYPE, name=package_name).to_string()
            package_base_dir = get_package_base_dir(purl=base_purl)

            if log_details:
                logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
                purls_string = " ".join(packageurls)
                logger(f"packageURLs: {purls_string}")

            # write packageURLs to file
            purl_file = write_packageurls_to_file(
                repo=cloned_data_repo,
                base_dir=package_base_dir,
                packageurls=packageurls,
            )
            purl_files.append(purl_file)
            purls.append(base_purl)

        if not purl_files:
            # Nothing was written for this batch: skip the empty commit and push
            continue

        if logger:
            purls_string = " ".join(purls)
            logger("Committing and pushing changes for a batch of packages: ")
            logger(f"{purls_string}")

        # commit changes
        # NOTE(review): the tool is a Python package, so a "pkg:cpan/" tool
        # name looks like a copy and paste leftover -- confirm the intended purl
        commit_changes(
            repo=cloned_data_repo,
            files_to_commit=purl_files,
            purls=purls,
            mine_type="packageURL",
            tool_name="pkg:cpan/minecode-pipelines",
            tool_version=VERSION,
        )

        # Push changes to remote repository
        push_changes(repo=cloned_data_repo)

    repos_to_clean = [cloned_data_repo]
    return repos_to_clean

pyproject-minecode_pipelines.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ dependencies = [
4242
"scancodeio >= 35.3.0",
4343
"ftputil >= 5.1.0",
4444
"jawa >= 2.2.0",
45-
"arrow >= 1.3.0"
45+
"arrow >= 1.3.0",
46+
"beautifulsoup4 >= 4.14.2"
4647
]
4748

4849
urls = { Homepage = "https://github.com/aboutcode-org/purldb" }
@@ -54,6 +55,7 @@ mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo"
5455
mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian"
5556
mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine"
5657
mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan"
58+
mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan"
5759

5860
[tool.bumpversion]
5961
current_version = "0.0.1b13"

0 commit comments

Comments
 (0)