11
11
import requests
12
12
13
13
from bs4 import BeautifulSoup
14
-
15
-
16
14
from packageurl import PackageURL
17
15
18
- from minecode_pipelines . utils import get_temp_file
19
- from minecode_pipelines . pipes import write_data_to_json_file
16
+ from scanpipe . pipes . fetch import fetch_http
17
+
20
18
21
19
"""
22
20
Visitors for cpan and cpan-like perl package repositories.
28
26
29
27
30
28
def get_cpan_packages (cpan_repo = CPAN_REPO , logger = None ):
29
+ """
30
+ Get cpan package names parsed from the `02packages.details.txt`
31
+ which contains a list of all modules and their respective
32
+ package archive paths. We parse the package names and their respective
33
+ path_prefixes with author page path from this list.
34
+ """
31
35
cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
32
- local_filename = "cpan_packages.gz"
36
+ cpan_packages_gz_download = fetch_http (cpan_packages_url )
37
+ with gzip .open (cpan_packages_gz_download , "rb" ) as file_content :
38
+ packages_content = file_content .read ()
33
39
34
- response = requests .get (cpan_packages_url , stream = True )
35
- if not response .ok :
36
- return
37
-
38
- with open (local_filename , "wb" ) as f :
39
- for chunk in response .iter_content (chunk_size = 8192 ):
40
- f .write (chunk )
40
+ package_path_by_name = {}
41
41
42
- with gzip .open ("cpan_packages.gz" , "rb" ) as f_in :
43
- with open ("cpan_packages.txt" , "wb" ) as f_out :
44
- f_out .writelines (f_in )
42
+ # The ``modules/02packages.details.txt`` file has the following section
43
+ # at the beginning of the file:
44
+ #
45
+ # File: 02packages.details.txt
46
+ # URL: http://www.cpan.org/modules/02packages.details.txt
47
+ # Description: Package names found in directory $CPAN/authors/id/
48
+ # Columns: package name, version, path
49
+ # Intended-For: Automated fetch routines, namespace documentation.
50
+ # Written-By: PAUSE version 1.005
51
+ # Line-Count: 268940
52
+ # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
53
+ #
54
+ # This information is there in first 10 lines, and the last line is an
55
+ # empty line, both of which we are ignoring below
45
56
46
- with open ("cpan_packages.txt" , encoding = "utf-8" ) as file :
47
- packages_content = file .read ()
57
+ modules = packages_content .split ("\n " )[9 :- 1 ]
48
58
49
- package_path_by_name = {}
59
+ # A sample line from this module list looks like this:
60
+ #
61
+ # Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
50
62
51
- modules = packages_content .split ("\n " )[9 :- 1 ]
52
63
for module in modules :
53
64
info = [section for section in module .split (" " ) if section ]
65
+
66
+ # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
54
67
package_path = info [- 1 ]
55
68
path_segments = package_path .split ("/" )
56
69
filename = path_segments .pop ()
@@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
60
73
_version = name_version .pop ()
61
74
name = "-" .join (name_version )
62
75
76
+ # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT/
63
77
package_path_by_name [name ] = path_prefix
64
78
65
79
return package_path_by_name
66
80
67
81
68
- def write_packages_json ( packages , name ):
69
- temp_file = get_temp_file ( name )
70
- write_data_to_json_file ( path = temp_file , data = packages )
71
- return temp_file
82
+ def get_cpan_packageurls ( name , path_prefix , logger = None ):
83
+ """
84
+ Given a package name and its path_prefix (author page path)
85
+ return a list of packageURLs for that package.
72
86
87
+ An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
88
+ all versions of all packages released by the author, so we can scrape
89
+ all the packageURLs from this author packages index.
90
+ """
91
+
92
+ author_name = path_prefix .split ("/" )[- 1 ]
73
93
74
- def get_cpan_packageurls (name , path_prefix , logger = None ):
75
94
packageurls = []
76
95
77
96
# file extensions found in cpan index
@@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
90
109
logger (f"Getting package versions for { name } from { cpan_author_page_url } " )
91
110
92
111
soup = BeautifulSoup (response .text , "html.parser" )
112
+
113
+ # We get all the listed packages in the author page index
93
114
package_list_elements = soup .find ("ul" ).text .split ("\n " )
94
115
95
116
package_elements = [
@@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
116
137
for version in unique_versions :
117
138
purl = PackageURL (
118
139
type = CPAN_TYPE ,
140
+ namespace = author_name ,
119
141
name = name ,
120
142
version = version ,
121
143
)
0 commit comments