Skip to content

Commit 359ffde

Browse files
authored
Merge pull request #18 from DSACMS/remove-github-lib
Remove PyGithub and Add Support for Gitlab and Bitbucket
2 parents 0e25d9a + 805111d commit 359ffde

File tree

3 files changed

+203
-41
lines changed

3 files changed

+203
-41
lines changed

codejson_index_generator/parsers.py

Lines changed: 199 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,170 @@
11
import json
2+
from json.decoder import JSONDecodeError
23
import base64
3-
import argparse
44
import os
5+
import requests
6+
import re
57

8+
from time import sleep, mktime, gmtime, time, localtime
69
from typing import Dict, Optional
7-
from github import Github, Repository, GithubException, Organization
10+
11+
RETRIES = 5
12+
13+
14+
def hit_endpoint(url, token, method='GET'):
    """Send an HTTP request and return the decoded JSON body.

    The request is retried up to ``RETRIES`` times when the server answers
    with a rate-limit status (403/429) or when a 200 response carries a body
    that is not valid JSON.

    Args:
        url: Endpoint to query.
        token: Bearer token for the ``Authorization`` header. When falsy, no
            ``Authorization`` header is sent so anonymous access still works.
        method: HTTP verb handed to ``requests.request``.

    Returns:
        The parsed JSON payload of the first 200 response, or an empty dict
        when every attempt returned a body that could not be decoded.

    Raises:
        ConnectionError: If the rate limit persists through all retries, if a
            403/429 response gives no usable hint on how long to wait, or if
            any other non-200 status is returned.
    """
    # Sending "bearer None" turns an anonymous request into a failed
    # authentication, so only attach the header when a token exists.
    headers = {"Authorization": f"bearer {token}"} if token else {}

    attempts = 0
    while attempts < RETRIES:
        response = requests.request(method, url, headers=headers, timeout=10)

        if response.status_code == 200:
            try:
                return json.loads(response.text)
            except JSONDecodeError:
                attempts += 1
                continue

        if response.status_code not in (403, 429):
            print(response.status_code)
            raise ConnectionError(
                f"Request to {url} failed with HTTP status {response.status_code}"
            )

        # 403/429: treated as a rate limit.
        attempts += 1
        if attempts >= RETRIES:
            # Give up before sleeping; waiting once more would be wasted time.
            raise ConnectionError(
                f"Rate limit was reached and couldn't be rectified after {attempts} tries"
            )

        # "x-ratelimit-reset" is read as an epoch timestamp; "retry-after" is
        # used as a fallback and read as a number of seconds.
        reset_at = response.headers.get("x-ratelimit-reset")
        retry_after = response.headers.get("retry-after")
        try:
            if reset_at is not None:
                wait_until = int(reset_at)
            elif retry_after is not None:
                wait_until = int(time()) + int(retry_after)
            else:
                wait_until = None
        except ValueError:
            wait_until = None

        if wait_until is None:
            # A 403 without reset information is most likely a plain
            # "forbidden"; retrying blindly would not help.
            raise ConnectionError(
                f"Request to {url} was refused with HTTP status "
                f"{response.status_code} and no usable rate-limit reset time"
            )

        # Clamp at zero: sleep() rejects negative values when the reset time
        # is already in the past.
        wait_in_seconds = max(0, int(wait_until - time()))
        wait_until_time = localtime(wait_until)

        print(f"Ran into rate limit sleeping for {url}!")
        print(
            f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)"
        )
        sleep(wait_in_seconds)

    # Only reached when every attempt produced an undecodable 200 body.
    return {}
56+
57+
58+
def get_repo_owner_and_name(repo_http_url):
    """Split a hosted repository URL into its owner and repository name.

    Recognised hosts are github.com, gitlab.com and bitbucket.org. A trailing
    ``.git`` suffix and a trailing slash are not part of the returned name.

    Args:
        repo_http_url: HTTP(S) URL of the repository, e.g.
            ``https://github.com/owner/repo``.

    Returns:
        Tuple of owner and repo. Or a tuple of None and None if the url is
        invalid or points at an unsupported host.
    """
    if not isinstance(repo_http_url, str):
        return None, None

    # One pattern for all supported hosts:
    #   group 1 -> owner (letters, digits, "-" and "_")
    #   group 2 -> repository name (additionally allows ".")
    # The name is matched lazily so an optional ".git" suffix is left out of
    # the captured group instead of being swallowed by it.
    pattern = (
        r"https?://(?:github\.com|gitlab\.com|bitbucket\.org)/"
        r"([A-Za-z0-9_-]+)/"
        r"([A-Za-z0-9_.-]+?)(?:\.git)?/?$"
    )

    result = re.search(pattern, repo_http_url)
    if not result:
        return None, None

    owner, repo = result.groups()
    return owner, repo
90+
891

992

1093
class IndexGenerator:
11-
def __init__(self, agency: str, verison: str, token: Optional[str] = None,):
12-
self.github = Github(token) if token else Github()
94+
def __init__(self, agency: str, version: str, token: Optional[str] = None, bitbucket_user: Optional[str] = None, bitbucket_password: Optional[str] = None, gitlab_token: Optional[str] = None):
1395

14-
# user can change agency and version depending on paramters
96+
# user can change agency and version depending on parameters
1597
self.index = {
1698
"agency": agency,
17-
"version": verison,
99+
"version": version,
18100
"measurementType": {
19101
"method": "projects"
20102
},
21103
"releases": []
22104
}
23105

24-
def get_code_json(self, repo: Repository) -> Optional[Dict]:
106+
self.token = token
107+
self.gitlab_token = gitlab_token
108+
self.bitbucket_user = bitbucket_user
109+
self.bitbucket_password = bitbucket_password
110+
111+
def get_code_json_github(self, repo: str) -> Optional[Dict]:
    """Fetch and decode ``code.json`` from a GitHub repository.

    Args:
        repo: HTTP(S) URL of the repository on github.com.

    Returns:
        The parsed ``code.json`` document, or ``None`` when the URL cannot be
        parsed, the API request fails, or the response cannot be decoded.
    """
    try:
        owner, name = get_repo_owner_and_name(repo)
        if owner is None or name is None:
            # Avoid querying ".../repos/None/None/..." for an unparsable URL.
            print(f"GitHub Error: could not parse repository URL {repo}")
            return None

        code_json_endpoint = f"https://api.github.com/repos/{owner}/{name}/contents/code.json"
        content_dict = hit_endpoint(code_json_endpoint, self.token)
    except Exception as e:
        print(f"GitHub Error: {e}")
        return None

    try:
        # The file body is read from the base64-encoded "content" field.
        decoded_content = base64.b64decode(content_dict['content'])
        return json.loads(decoded_content)
    except (KeyError, TypeError, json.JSONDecodeError, ValueError) as e:
        # KeyError/TypeError: the payload has no "content" field (for example
        # an empty dict or a list), so there is nothing to decode.
        print(f"JSON Error: {str(e)}")
        return None
37126

38-
def save_code_json(self, repo: Repository, output_path: str) -> Optional[str]:
127+
def get_code_json_gitlab(self, repo: str) -> Optional[Dict]:
    """Fetch and decode ``code.json`` from a GitLab project.

    Args:
        repo: HTTP(S) URL of the project on gitlab.com.

    Returns:
        The parsed ``code.json`` document, or ``None`` when the URL cannot be
        parsed, the API request fails, or the response cannot be decoded.
    """
    try:
        owner, name = get_repo_owner_and_name(repo)
        if owner is None or name is None:
            # Avoid querying the project "None%2FNone" for an unparsable URL.
            print(f"Problem querying the Gitlab API: could not parse repository URL {repo}")
            return None

        # The project is addressed by its path with "/" encoded as "%2F".
        code_json_endpoint = f"https://gitlab.com/api/v4/projects/{owner}%2F{name}/repository/files/code.json?ref=HEAD"
        content_dict = hit_endpoint(code_json_endpoint, self.gitlab_token)
    except Exception as e:
        # Include the cause so failures can be diagnosed from the log.
        print(f"Problem querying the Gitlab API: {e}")
        return None

    try:
        # The file body is read from the base64-encoded "content" field.
        decoded_content = base64.b64decode(content_dict['content'])
        return json.loads(decoded_content)
    except (KeyError, TypeError, json.JSONDecodeError, ValueError) as e:
        # KeyError/TypeError: the payload has no "content" field.
        print(f"JSON Error {e}")
        return None
142+
143+
def get_code_json_bitbucket(self, repo: str) -> Optional[Dict]:
    """Fetch and parse ``code.json`` from a Bitbucket repository.

    The file is read from the repository's raw view at ``HEAD``. Basic-auth
    credentials are attached only when both user and password were supplied.

    Args:
        repo: HTTP(S) URL of the repository on bitbucket.org.

    Returns:
        The parsed ``code.json`` document, or ``None`` when the URL cannot be
        parsed, the request fails, the status is not 200, or the body is not
        valid JSON.
    """
    try:
        owner, name = get_repo_owner_and_name(repo)
        if owner is None or name is None:
            print(f"Exception when querying bitbucket.org: could not parse repository URL {repo}")
            return None

        code_json_endpoint = f"https://bitbucket.org/{owner}/{name}/raw/HEAD/code.json"

        # The session is closed on exit. No preliminary POST to the plain-http
        # site is made: with session.auth set it would transmit the
        # credentials unencrypted, and its result was never used.
        with requests.Session() as session:
            if self.bitbucket_user and self.bitbucket_password:
                session.auth = (self.bitbucket_user, self.bitbucket_password)
            response = session.get(code_json_endpoint, timeout=10)
    except Exception as e:
        print(f"Exception when querying bitbucket.org: {e}")
        return None

    if response.status_code != 200:
        # Error pages are not JSON; report the status instead of parsing.
        print(f"Exception when querying bitbucket.org: HTTP status {response.status_code}")
        return None

    try:
        return json.loads(response.text)
    except ValueError as e:
        print(f"JSON Error: {e}")
        return None
156+
157+
def get_code_json(self, repo: str) -> Optional[Dict]:
    """Fetch ``code.json`` using the fetcher that matches the repo's host.

    The host is chosen by substring, checked in the order github, gitlab,
    bitbucket; the first marker found in ``repo`` wins.

    Args:
        repo: Repository URL.

    Returns:
        Whatever the selected host-specific fetcher returns, or ``None``
        when no marker occurs in ``repo``.
    """
    host_fetchers = (
        ('github', 'get_code_json_github'),
        ('gitlab', 'get_code_json_gitlab'),
        ('bitbucket', 'get_code_json_bitbucket'),
    )

    for marker, fetcher_name in host_fetchers:
        if marker in repo:
            # Resolve the method lazily so only the chosen fetcher is touched.
            return getattr(self, fetcher_name)(repo)

    return None
166+
167+
def save_code_json(self, repo: str, output_path: str) -> Optional[str]:
39168

40169
res = self.get_code_json(repo)
41170

@@ -57,46 +186,78 @@ def update_index(self, index: Dict, code_json: Dict, org_name: str, repo_name: s
57186

58187
index['releases'].append(baseline)
59188

60-
def get_org_repos(self, org_name: str) -> list[Organization]:
189+
def get_github_org_repos(self, org_name: str) -> list[Dict]:
    """List the repositories of a GitHub organization.

    The listing endpoint is paginated, so pages are requested one after
    another until a short or empty page signals the end.

    Args:
        org_name: Login of the GitHub organization.

    Returns:
        A list with one dict per repository as returned by the API.

    Raises:
        Exception: Whatever ``hit_endpoint`` raises is propagated unchanged.
    """
    print(f"\nProcessing organization: {org_name}")

    per_page = 100
    page = 1
    repo_list = []

    while True:
        org_endpoint = (
            f"https://api.github.com/orgs/{org_name}/repos"
            f"?per_page={per_page}&page={page}"
        )
        batch = hit_endpoint(org_endpoint, self.token)

        # A non-list payload (e.g. the empty dict returned after decode
        # failures) or an empty page means there is nothing more to read.
        if not isinstance(batch, list) or not batch:
            break

        repo_list.extend(batch)

        # A page that is not full is the last one; skip the extra request.
        if len(batch) < per_page:
            break
        page += 1

    total_repos = len(repo_list)
    print(f"Found {total_repos} public repositories")

    return repo_list
71203

72-
def save_organization_files(self, org_name: str, codeJSONPath) -> None:
73-
raise NotImplementedError
204+
def _enumerate_repo_orgs(self,id,org_name,repo_name, url, total_repos, codeJSONPath=None,add_to_index=True):
205+
print(f"\nChecking {repo_name} [{id}/{total_repos}]")
206+
207+
if not codeJSONPath:
208+
code_json = self.get_code_json(url)
209+
else:
210+
repoPath = os.path.join(codeJSONPath, (repo_name + '.json'))
211+
code_json = self.save_code_json(url,repoPath)
212+
213+
if code_json and add_to_index:
214+
print(f"✅ Found code.json in {repo_name}")
215+
self.update_index(self.index, code_json, org_name, repo_name)
216+
elif not code_json:
217+
print(f"❌ No code.json found in {repo_name}")
218+
219+
def process_github_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None:
    """Process every repository listed for a GitHub organization.

    Args:
        org_name: Login of the GitHub organization.
        add_to_index: Forwarded to ``_enumerate_repo_orgs``.
        codeJSONPath: Forwarded to ``_enumerate_repo_orgs``.
    """
    repositories = self.get_github_org_repos(org_name)
    repo_count = len(repositories)

    position = 0
    for repository in repositories:
        position += 1
        try:
            # Field lookups stay inside the try so a malformed entry is
            # reported and skipped like any other per-repository failure.
            self._enumerate_repo_orgs(
                position,
                org_name,
                repository['name'],
                repository['svn_url'],
                repo_count,
                codeJSONPath=codeJSONPath,
                add_to_index=add_to_index,
            )
        except Exception as error:
            print(error)
230+
231+
def get_gitlab_org_repos(self, org_name: str) -> Optional[list[Dict]]:
    """List the projects of a GitLab group.

    The listing endpoint is paginated, so pages are requested one after
    another until a short or empty page signals the end.

    Args:
        org_name: Path of the GitLab group; "/" separators of nested groups
            are encoded as "%2F" for the request.

    Returns:
        A list with one dict per project as returned by the API, or ``None``
        when querying the API raised an exception.
    """
    try:
        url_encoded_org_name = org_name.replace("/", "%2F")

        per_page = 100
        page = 1
        repo_list = []

        while True:
            org_endpoint = (
                f"https://gitlab.com/api/v4/groups/{url_encoded_org_name}/projects"
                f"?per_page={per_page}&page={page}"
            )
            batch = hit_endpoint(org_endpoint, self.gitlab_token)

            # A non-list payload or an empty page means nothing more to read.
            if not isinstance(batch, list) or not batch:
                break

            repo_list.extend(batch)

            # A page that is not full is the last one.
            if len(batch) < per_page:
                break
            page += 1

        total_repos = len(repo_list)
        print(f"Found {total_repos} public repositories")

        # Return the projects themselves; callers iterate over them.
        return repo_list
    except Exception as e:
        print(f"Ran into Exception when querying Gitlab Repos in group {org_name}: {e}")
        return None
74245

75-
def process_organization(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None:
246+
def process_gitlab_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None:
    """Process every project listed for a GitLab group.

    A failure while handling one project is printed and the remaining
    projects are still processed, matching ``process_github_org_files``.

    Args:
        org_name: Path of the GitLab group.
        add_to_index: Forwarded to ``_enumerate_repo_orgs``.
        codeJSONPath: Forwarded to ``_enumerate_repo_orgs``.
    """
    try:
        # The listing may come back as None on failure; treat it as empty.
        orgs = self.get_gitlab_org_repos(org_name) or []
        total_repos = len(orgs)
    except Exception as e:
        print(f"Error processing organization {org_name}: {str(e)}")
        return

    for id, repo in enumerate(orgs, 1):
        try:
            self._enumerate_repo_orgs(
                id, org_name, repo['name'], repo['web_url'], total_repos,
                codeJSONPath=codeJSONPath, add_to_index=add_to_index
            )
        except Exception as e:
            # Keep going: one broken project should not hide the others.
            print(e)
97258

98259
def save_index(self, output_path: str) -> None:
99-
# sorts index by organizaiton then by name
260+
# sorts index by organization then by name
100261
self.index['releases'].sort(key=lambda x: (x.get('organization', ''), x.get('name', '')))
101262

102263
with open(output_path, 'w') as f:

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,13 @@ def main():
4040
try:
4141
indexGen = IndexGenerator(
4242
agency = args.agency,
43-
verison = args.version,
43+
version = args.version,
4444
token = github_key
4545
)
4646

4747
for org in args.orgs.split(","):
4848
org = org.strip()
49-
indexGen.process_organization(org)
49+
indexGen.process_github_org_files(org)
5050

5151
indexGen.save_index(args.output)
5252
print(f"\nIndexing complete. Results saved to {args.output}")

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ repository = "https://github.com/DSACMS/codejson-index-generator"
99

1010
[tool.poetry.dependencies]
1111
python = "^3.13"
12-
pygithub = ">=1.59,<2.0"
12+
requests = "^2.32.4"
13+
llnl-scraper = "^0.15.0"
1314

1415

1516
[build-system]

0 commit comments

Comments
 (0)