11
11
import requests
12
12
13
13
from bs4 import BeautifulSoup
14
-
15
-
16
14
from packageurl import PackageURL
17
15
18
- from minecode_pipelines . utils import get_temp_file
19
- from minecode_pipelines . pipes import write_data_to_json_file
16
+ from scanpipe . pipes . fetch import fetch_http
17
+
20
18
21
19
"""
22
20
Visitors for cpan and cpan-like perl package repositories.
28
26
29
27
30
28
def get_cpan_packages (cpan_repo = CPAN_REPO , logger = None ):
29
+ """
30
+ Get cpan package names parsed from the `02packages.details.txt`
31
+ which contains a list of all modules and their respective
32
+ package archive paths. We parse the package names and their respective
33
+ path_prefixes with author page path from this list.
34
+ """
31
35
cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
32
- local_filename = "cpan_packages.gz"
36
+ cpan_packages_gz_download = fetch_http (cpan_packages_url )
37
+ with gzip .open (cpan_packages_gz_download , "rb" ) as file_content :
38
+ packages_content = file_content .read ()
33
39
34
- response = requests .get (cpan_packages_url , stream = True )
35
- if not response .ok :
36
- return
37
-
38
- with open (local_filename , "wb" ) as f :
39
- for chunk in response .iter_content (chunk_size = 8192 ):
40
- f .write (chunk )
40
+ package_path_by_name = {}
41
41
42
- with gzip .open ("cpan_packages.gz" , "rb" ) as f_in :
43
- with open ("cpan_packages.txt" , "wb" ) as f_out :
44
- f_out .writelines (f_in )
42
+ # The ``modules/02packages.details.txt`` file has the following section
43
+ # at the beginning of the file:
44
+ #
45
+ # File: 02packages.details.txt
46
+ # URL: http://www.cpan.org/modules/02packages.details.txt
47
+ # Description: Package names found in directory $CPAN/authors/id/
48
+ # Columns: package name, version, path
49
+ # Intended-For: Automated fetch routines, namespace documentation.
50
+ # Written-By: PAUSE version 1.005
51
+ # Line-Count: 268940
52
+ # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
53
+ #
54
+ # This information is there in first 10 lines, and the last line is an
55
+ # empty line, both of which we are ignoring below
45
56
46
- with open ("cpan_packages.txt" , encoding = "utf-8" ) as file :
47
- packages_content = file .read ()
57
+ modules = packages_content .split ("\n " )[9 :- 1 ]
48
58
49
- package_path_by_name = {}
59
+ # A sample line from this module list looks like this:
60
+ #
61
+ # Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
50
62
51
- modules = packages_content .split ("\n " )[9 :- 1 ]
52
63
for module in modules :
53
64
info = [section for section in module .split (" " ) if section ]
65
+
66
+ # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
54
67
package_path = info [- 1 ]
55
68
path_segments = package_path .split ("/" )
56
69
filename = path_segments .pop ()
@@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
60
73
_version = name_version .pop ()
61
74
name = "-" .join (name_version )
62
75
76
+ # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT/
63
77
package_path_by_name [name ] = path_prefix
64
78
65
79
return package_path_by_name
66
80
67
81
68
- def write_packages_json ( packages , name ):
69
- temp_file = get_temp_file ( name )
70
- write_data_to_json_file ( path = temp_file , data = packages )
71
- return temp_file
82
+ def get_cpan_packageurls ( name , path_prefix , logger = None ):
83
+ """
84
+ Given a package name and its path_prefix (author page path)
85
+ return a list of packageURLs for that package.
72
86
87
+ An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
88
+ all versions of all packages released by the author, so we can scrape
89
+ all the packageURLs from this author packages index.
90
+ """
91
+
92
+ author_name = path_prefix .split ("/" )[- 1 ]
73
93
74
- def get_cpan_packageurls (name , path_prefix , logger = None ):
75
94
packageurls = []
76
95
77
96
# file extensions found in cpan index
@@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
90
109
logger (f"Getting package versions for { name } from { cpan_author_page_url } " )
91
110
92
111
soup = BeautifulSoup (response .text , "html.parser" )
112
+
113
+ # We get all the listed packages in the author page index
93
114
package_list_elements = soup .find ("ul" ).text .split ("\n " )
94
115
95
116
package_elements = [
@@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
116
137
for version in unique_versions :
117
138
purl = PackageURL (
118
139
type = CPAN_TYPE ,
140
+ namespace = author_name ,
119
141
name = name ,
120
142
version = version ,
121
143
)
0 commit comments