1111import requests
1212
1313from bs4 import BeautifulSoup
14-
15-
1614from packageurl import PackageURL
1715
18- from minecode_pipelines . utils import get_temp_file
19- from minecode_pipelines . pipes import write_data_to_json_file
16+ from scanpipe . pipes . fetch import fetch_http
17+
2018
2119"""
2220Visitors for cpan and cpan-like perl package repositories.
2826
2927
3028def get_cpan_packages (cpan_repo = CPAN_REPO , logger = None ):
29+ """
30+ Get cpan package names parsed from the `02packages.details.txt`
31+ which contains a list of all modules and their respective
32+ package archive paths. We parse the package names and their respective
33+ path_prefixes with author page path from this list.
34+ """
3135 cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
32- local_filename = "cpan_packages.gz"
36+ cpan_packages_gz_download = fetch_http (cpan_packages_url )
37+ with gzip .open (cpan_packages_gz_download , "rb" ) as file_content :
38+ packages_content = file_content .read ()
3339
34- response = requests .get (cpan_packages_url , stream = True )
35- if not response .ok :
36- return
37-
38- with open (local_filename , "wb" ) as f :
39- for chunk in response .iter_content (chunk_size = 8192 ):
40- f .write (chunk )
40+ package_path_by_name = {}
4141
42- with gzip .open ("cpan_packages.gz" , "rb" ) as f_in :
43- with open ("cpan_packages.txt" , "wb" ) as f_out :
44- f_out .writelines (f_in )
42+ # The ``modules/02packages.details.txt`` file has the following section
43+ # at the beginning of the file:
44+ #
45+ # File: 02packages.details.txt
46+ # URL: http://www.cpan.org/modules/02packages.details.txt
47+ # Description: Package names found in directory $CPAN/authors/id/
48+ # Columns: package name, version, path
49+ # Intended-For: Automated fetch routines, namespace documentation.
50+ # Written-By: PAUSE version 1.005
51+ # Line-Count: 268940
52+ # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
53+ #
54+ # This information is there in first 10 lines, and the last line is an
55+ # empty line, both of which we are ignoring below
4556
46- with open ("cpan_packages.txt" , encoding = "utf-8" ) as file :
47- packages_content = file .read ()
57+ modules = packages_content .split ("\n " )[9 :- 1 ]
4858
49- package_path_by_name = {}
59+ # A sample line from this module list looks like this:
60+ #
61+ # Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
5062
51- modules = packages_content .split ("\n " )[9 :- 1 ]
5263 for module in modules :
5364 info = [section for section in module .split (" " ) if section ]
65+
66+ # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
5467 package_path = info [- 1 ]
5568 path_segments = package_path .split ("/" )
5669 filename = path_segments .pop ()
@@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
6073 _version = name_version .pop ()
6174 name = "-" .join (name_version )
6275
76+ # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT/
6377 package_path_by_name [name ] = path_prefix
6478
6579 return package_path_by_name
6680
6781
68- def write_packages_json ( packages , name ):
69- temp_file = get_temp_file ( name )
70- write_data_to_json_file ( path = temp_file , data = packages )
71- return temp_file
82+ def get_cpan_packageurls ( name , path_prefix , logger = None ):
83+ """
84+ Given a package name and its path_prefix (author page path )
85+ return a list of packageURLs for that package.
7286
87+ An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
88+ all versions of all packages released by the author, so we can scrape
89+ all the packageURLs from this author packages index.
90+ """
91+
92+ author_name = path_prefix .split ("/" )[- 1 ]
7393
74- def get_cpan_packageurls (name , path_prefix , logger = None ):
7594 packageurls = []
7695
7796 # file extensions found in cpan index
@@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
90109 logger (f"Getting package versions for { name } from { cpan_author_page_url } " )
91110
92111 soup = BeautifulSoup (response .text , "html.parser" )
112+
113+ # We get all the listed packages in the author page index
93114 package_list_elements = soup .find ("ul" ).text .split ("\n " )
94115
95116 package_elements = [
@@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
116137 for version in unique_versions :
117138 purl = PackageURL (
118139 type = CPAN_TYPE ,
140+ namespace = author_name ,
119141 name = name ,
120142 version = version ,
121143 )
0 commit comments