Skip to content

Commit 583c4ef

Browse files
Address review feedback
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 2ee926a commit 583c4ef

File tree

1 file changed

+46
-24
lines changed

1 file changed

+46
-24
lines changed

minecode_pipelines/miners/cpan.py

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,10 @@
1111
import requests
1212

1313
from bs4 import BeautifulSoup
14-
15-
1614
from packageurl import PackageURL
1715

18-
from minecode_pipelines.utils import get_temp_file
19-
from minecode_pipelines.pipes import write_data_to_json_file
16+
from scanpipe.pipes.fetch import fetch_http
17+
2018

2119
"""
2220
Visitors for cpan and cpan-like perl package repositories.
@@ -28,29 +26,44 @@
2826

2927

3028
def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
29+
"""
30+
Get cpan package names parsed from the `02packages.details.txt`
31+
which contains a list of all modules and their respective
32+
package archive paths. We parse the package names and their respective
33+
path_prefixes with author page path from this list.
34+
"""
3135
cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
32-
local_filename = "cpan_packages.gz"
36+
cpan_packages_gz_download = fetch_http(cpan_packages_url)
37+
with gzip.open(cpan_packages_gz_download, "rb") as file_content:
38+
packages_content = file_content.read()
3339

34-
response = requests.get(cpan_packages_url, stream=True)
35-
if not response.ok:
36-
return
37-
38-
with open(local_filename, "wb") as f:
39-
for chunk in response.iter_content(chunk_size=8192):
40-
f.write(chunk)
40+
package_path_by_name = {}
4141

42-
with gzip.open("cpan_packages.gz", "rb") as f_in:
43-
with open("cpan_packages.txt", "wb") as f_out:
44-
f_out.writelines(f_in)
42+
# The ``modules/02packages.details.txt`` file has the following section
43+
# at the beginning of the file:
44+
#
45+
# File: 02packages.details.txt
46+
# URL: http://www.cpan.org/modules/02packages.details.txt
47+
# Description: Package names found in directory $CPAN/authors/id/
48+
# Columns: package name, version, path
49+
# Intended-For: Automated fetch routines, namespace documentation.
50+
# Written-By: PAUSE version 1.005
51+
# Line-Count: 268940
52+
# Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
53+
#
54+
# This header takes up the first 9 lines, and the last line is an
55+
# empty line, both of which we are ignoring below
4556

46-
with open("cpan_packages.txt", encoding="utf-8") as file:
47-
packages_content = file.read()
57+
modules = packages_content.split("\n")[9:-1]
4858

49-
package_path_by_name = {}
59+
# A sample line from this module list looks like this:
60+
#
61+
# Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
5062

51-
modules = packages_content.split("\n")[9:-1]
5263
for module in modules:
5364
info = [section for section in module.split(" ") if section]
65+
66+
# This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
5467
package_path = info[-1]
5568
path_segments = package_path.split("/")
5669
filename = path_segments.pop()
@@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
6073
_version = name_version.pop()
6174
name = "-".join(name_version)
6275

76+
# for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT/
6377
package_path_by_name[name] = path_prefix
6478

6579
return package_path_by_name
6680

6781

68-
def write_packages_json(packages, name):
69-
temp_file = get_temp_file(name)
70-
write_data_to_json_file(path=temp_file, data=packages)
71-
return temp_file
82+
def get_cpan_packageurls(name, path_prefix, logger=None):
83+
"""
84+
Given a package name and its path_prefix (author page path)
85+
return a list of packageURLs for that package.
7286
87+
An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
88+
all versions of all packages released by the author, so we can scrape
89+
all the packageURLs from this author packages index.
90+
"""
91+
92+
author_name = path_prefix.split("/")[-1]
7393

74-
def get_cpan_packageurls(name, path_prefix, logger=None):
7594
packageurls = []
7695

7796
# file extensions found in cpan index
@@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
90109
logger(f"Getting package versions for {name} from {cpan_author_page_url}")
91110

92111
soup = BeautifulSoup(response.text, "html.parser")
112+
113+
# We get all the listed packages in the author page index
93114
package_list_elements = soup.find("ul").text.split("\n")
94115

95116
package_elements = [
@@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
116137
for version in unique_versions:
117138
purl = PackageURL(
118139
type=CPAN_TYPE,
140+
namespace=author_name,
119141
name=name,
120142
version=version,
121143
)

0 commit comments

Comments
 (0)