Skip to content

Commit 353dd31

Browse files
Add support to mine npm PackageURLs
Reference: #661 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent c468a76 commit 353dd31

File tree

7 files changed

+613
-45
lines changed

7 files changed

+613
-45
lines changed

minecode_pipelines/miners/npm.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
11+
import json
12+
import requests
13+
14+
from packageurl import PackageURL
15+
16+
17+
"""
18+
Visitors for Npmjs and npmjs-like javascript package repositories.
19+
20+
We have this hierarchy in npm replicate and registry index:
21+
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls
22+
23+
See https://github.com/orgs/community/discussions/152515 for information on
24+
the latest replicate.npmjs.com API.
25+
26+
https://replicate.npmjs.com/_all_docs
27+
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
28+
in paginated queries.
29+
30+
https://replicate.npmjs.com/_changes
31+
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
32+
can be fetched in paginated queries.
33+
34+
https://registry.npmjs.org/{namespace/name}
35+
For each npm package, a JSON containing details including the list of all releases
36+
and archives, their URLs, and some metadata for each release.
37+
38+
https://registry.npmjs.org/{namespace/name}/{version}
39+
For each release, a JSON contains details for the released version and all the
40+
downloads available for this release.
41+
"""
42+
43+
44+
# Base URL of the npm replicate service (the _all_docs index and _changes feed).
NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
# Base URL of the npm registry serving per-package JSON documents.
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
# Package type handed to PackageURL when building npm purls.
NPM_TYPE = "NPM"
# Value sent as the "limit" query parameter for each replicate API page.
NPM_REPLICATE_BATCH_SIZE = 10_000
48+
49+
50+
def get_package_names_last_key(package_data):
    """
    Return a ``(names, last_key)`` tuple for one page of ``_all_docs`` data.

    ``names`` holds the "id" of every entry under "rows", in order, and
    ``last_key`` is the "key" of the final row. The page must contain at
    least one row: an empty "rows" list raises IndexError.
    """
    names = []
    for row in package_data.get("rows"):
        names.append(row.get("id"))

    final_row = package_data.get("rows")[-1]
    return names, final_row.get("key")
54+
55+
56+
def get_package_names_last_seq(package_data):
    """
    Return a ``(names, last_seq)`` tuple for one page of ``_changes`` data.

    ``names`` holds the "id" of every entry under "results", in order, and
    ``last_seq`` is the top-level "last_seq" value (None when absent).
    """
    names = []
    for change in package_data.get("results"):
        names.append(change.get("id"))

    return names, package_data.get("last_seq")
60+
61+
62+
def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the "last_seq" reported by the ``_changes`` endpoint of
    ``replicate_url`` when queried in descending order, or None when the
    request does not succeed.
    """
    # NOTE(review): CouchDB-style APIs document boolean query values in
    # lowercase ("true"); confirm that "True" is honored by this endpoint.
    latest_changes_url = replicate_url + "_changes?descending=True"
    response = requests.get(latest_changes_url)
    if response.ok:
        # Only the sequence is of interest here; the names are discarded.
        _names, current_seq = get_package_names_last_seq(response.json())
        return current_seq

    return None
71+
72+
73+
def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO):
    """
    Return a ``({"packages": names}, last_seq)`` tuple listing the packages
    reported by the ``_changes`` endpoint of ``replicate_url`` after the
    ``last_seq`` sequence.

    Changes are fetched page by page, NPM_REPLICATE_BATCH_SIZE entries at a
    time, each page starting from the "last_seq" of the previous one, until a
    page comes back with fewer entries than the batch size.

    If a request fails, paging stops and the names collected so far are
    returned together with the sequence of the last page fetched successfully
    (or the ``last_seq`` argument if no page was fetched), using the same
    tuple shape as the success path.
    """
    all_package_names = []
    iteration = 0

    while True:
        print(f"Processing iteration: {iteration}: changes after seq: {last_seq}")
        npm_replicate_changes = (
            replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Fall through to the common return so callers that unpack the
            # result always get a (packages, last_seq) pair.
            break

        package_data = response.json()
        package_names, last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        # A short page means the last set of changes has been fetched.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        iteration += 1

    return {"packages": all_package_names}, last_seq
97+
98+
99+
def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
    """
    Return a ``{"packages": batches}`` mapping built by paging through the
    ``_all_docs`` endpoint of ``replicate_url``.

    ``batches`` contains one list of package names per fetched page (the
    pages are appended as lists, not flattened). Each follow-up page is
    requested with ``start_key`` set to the last key of the previous page.

    If the very first request fails, an empty list is returned instead of a
    mapping. If a follow-up request fails, an Exception carrying the
    requested URL and the response text is raised.
    """
    batches = []
    index_url = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"

    first_response = requests.get(index_url)
    if not first_response.ok:
        return batches

    first_page = first_response.json()
    names, last_key = get_package_names_last_key(first_page)
    batches.append(names)

    # Number of follow-up requests, derived from the reported row count.
    follow_up_pages = int(first_page.get("total_rows") / NPM_REPLICATE_BATCH_SIZE) + 1

    page_number = 0
    while page_number < follow_up_pages:
        page_url = index_url + f'&start_key="{last_key}"'
        print(f"Processing iteration: {page_number}: {page_url}")

        page_response = requests.get(page_url)
        if not page_response.ok:
            raise Exception(page_url, page_response.text)

        names, last_key = get_package_names_last_key(page_response.json())
        batches.append(names)
        page_number += 1

    return {"packages": batches}
127+
128+
129+
def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of PackageURL strings, one for each version listed in the
    registry document of the npm package ``name`` fetched from ``npm_repo``.

    Return an empty list when the request does not succeed or when the
    document has no "versions" entry.
    """
    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return []

    # ``.get("versions")`` yields None when the key is absent; treat that as
    # "no versions" rather than failing with a TypeError on iteration.
    versions = response.json().get("versions") or {}

    return [
        PackageURL(type=NPM_TYPE, name=name, version=version).to_string()
        for version in versions
    ]
147+
148+
149+
def load_npm_packages(packages_file):
    """
    Return the value stored under "packages" in the JSON file at
    ``packages_file``, or an empty list when that key is missing.
    """
    with open(packages_file) as packages_json:
        loaded = json.loads(packages_json.read())

    return loaded.get("packages", [])

minecode_pipelines/miners/pypi.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313

1414
from packageurl import PackageURL
1515

16-
from minecode_pipelines.utils import get_temp_file
17-
from minecode_pipelines.pipes import write_data_to_json_file
18-
1916
"""
2017
Visitors for Pypi and Pypi-like Python package repositories.
2118
@@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
5249
return response.json()
5350

5451

55-
def write_packages_json(packages, name):
56-
temp_file = get_temp_file(name)
57-
write_data_to_json_file(path=temp_file, data=packages)
58-
return temp_file
59-
60-
6152
def get_pypi_packageurls(name):
6253
packageurls = []
6354

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"last_serial": 0,
3+
"date": null
4+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import federatedcode
25+
26+
from minecode_pipelines.pipes import npm
27+
from minecode_pipelines import pipes
28+
29+
30+
class MineandPublishNPMPURLs(Pipeline):
    """
    Mine all packageURLs from a npm index and publish them to
    a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        # Steps are listed in the order they are declared to the pipeline.
        pipeline_steps = (
            cls.check_federatedcode_eligibility,
            cls.mine_npm_packages,
            cls.mine_and_publish_npm_packageurls,
            cls.delete_cloned_repos,
        )
        return pipeline_steps

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def mine_npm_packages(self):
        """Mine npm package names from npm indexes or checkpoint."""
        mined = npm.mine_npm_packages(logger=self.log)
        # The three results are kept on the instance for the publishing step.
        self.npm_packages, self.state, self.last_seq = mined

    def mine_and_publish_npm_packageurls(self):
        """Get npm packageURLs for all mined npm package names."""
        published_repos = npm.mine_and_publish_npm_packageurls(
            packages_file=self.npm_packages,
            state=self.state,
            last_seq=self.last_seq,
            logger=self.log,
        )
        # Remembered so that the final step can delete them.
        self.repos = published_repos

    def delete_cloned_repos(self):
        # Hand the repos recorded by the publishing step to the shared helper.
        cloned = self.repos
        pipes.delete_cloned_repos(repos=cloned, logger=self.log)

minecode_pipelines/pipes/__init__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from scanpipe.pipes.federatedcode import delete_local_clone
2121
from scanpipe.pipes.federatedcode import commit_and_push_changes
2222

23+
from minecode_pipelines.utils import get_temp_file
24+
2325
# states:
2426
# note: a state is null when mining starts
2527
INITIAL_SYNC_STATE = "initial-sync"
@@ -29,6 +31,12 @@
2931
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"
3032

3133

34+
def write_packages_json(packages, name):
    """
    Write ``packages`` as JSON to the path returned by ``get_temp_file``
    for ``name`` and return that path.
    """
    json_path = get_temp_file(name)
    write_data_to_json_file(data=packages, path=json_path)
    return json_path
38+
39+
3240
def fetch_checkpoint_from_github(config_repo, checkpoint_path):
3341
repo_name = config_repo.split("github.com")[-1]
3442
checkpoints_file = (
@@ -81,6 +89,32 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec
8189
)
8290

8391

92+
def update_checkpoint_state(
    cloned_repo,
    state,
    checkpoint_path,
    config_repo=MINECODE_PIPELINES_CONFIG_REPO,
):
    """
    Fetch the checkpoint stored at ``checkpoint_path`` in ``config_repo``,
    set its "state" entry to ``state`` and pass the result, together with
    ``cloned_repo`` and ``checkpoint_path``, to ``update_checkpoints_in_github``.
    """
    current = fetch_checkpoint_from_github(
        checkpoint_path=checkpoint_path,
        config_repo=config_repo,
    )
    # Only the "state" entry changes; everything else fetched is kept as is.
    current["state"] = state

    update_checkpoints_in_github(
        path=checkpoint_path,
        cloned_repo=cloned_repo,
        checkpoint=current,
    )
108+
109+
110+
def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name):
    """
    Fetch the checkpoint stored at ``checkpoint_path`` in ``config_repo`` and
    return the path of the file ``write_packages_json`` writes it to for ``name``.
    """
    checkpoint_data = fetch_checkpoint_from_github(
        checkpoint_path=checkpoint_path,
        config_repo=config_repo,
    )
    packages_file = write_packages_json(checkpoint_data, name=name)
    return packages_file
116+
117+
84118
def write_packageurls_to_file(repo, base_dir, packageurls):
85119
purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME)
86120
purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path

0 commit comments

Comments
 (0)