Skip to content

Commit 74bb3da

Browse files
committed
add Bitbucket, github and gitlab support. Add support for github and gitlab orgs
Signed-off-by: Isaac Milarsky <[email protected]>
1 parent aa34e08 commit 74bb3da

File tree

3 files changed

+99
-28
lines changed

3 files changed

+99
-28
lines changed

codejson_index_generator/parsers.py

Lines changed: 97 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,13 @@ def get_repo_owner_and_name(repo_http_url):
7070
# The first group contains the owner of the github repo extracted from the url
7171
# The second group contains the name of the github repo extracted from the url
7272
# 'But what is a regular expression?' ----> https://docs.python.org/3/howto/regex.html
73-
regex = r"https?:\/\/github\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$"
73+
if 'github' in repo_http_url:
74+
regex = r"https?:\/\/github\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$"
75+
elif 'gitlab' in repo_http_url:
76+
regex = r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$"
77+
elif 'bitbucket' in repo_http_url:
78+
regex = r"https?:\/\/bitbucket\.org\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$"
79+
7480
result = re.search(regex, repo_http_url)
7581

7682
if not result:
@@ -86,7 +92,7 @@ def get_repo_owner_and_name(repo_http_url):
8692

8793

8894
class IndexGenerator:
89-
def __init__(self, agency: str, version: str, token: Optional[str] = None,):
95+
def __init__(self, agency: str, version: str, token: Optional[str] = None, bitbucket_user: Optional[str] = None, bitbucket_password: Optional[str] = None, gitlab_token: Optional[str] = None):
9096

9197
# user can change agency and version depending on parameters
9298
self.index = {
@@ -99,6 +105,9 @@ def __init__(self, agency: str, version: str, token: Optional[str] = None,):
99105
}
100106

101107
self.token = token
108+
self.gitlab_token = gitlab_token
109+
self.bitbucket_user = bitbucket_user
110+
self.bitbucket_password = bitbucket_password
102111

103112
def get_code_json_github(self,repo : str) -> Optional[Dict]:
104113
try:
@@ -116,14 +125,45 @@ def get_code_json_github(self,repo : str) -> Optional[Dict]:
116125
print(f"JSON Error: {str(e)}")
117126
return None
118127

119-
def get_code_json_other(self,repo: str) -> Optional[Dict]:
120-
return None
128+
def get_code_json_gitlab(self, repo: str) -> Optional[Dict]:
    """Fetch and parse ``code.json`` from a gitlab.com repository.

    Args:
        repo: HTTP(S) URL of the GitLab repository.

    Returns:
        The parsed ``code.json`` content, or ``None`` when the file could
        not be fetched, decoded, or parsed.
    """
    try:
        owner, name = get_repo_owner_and_name(repo)
        # The project is addressed by its URL-encoded "owner/name" path.
        code_json_endpoint = f"https://gitlab.com/api/v4/projects/{owner}%2F{name}/repository/files/code.json?ref=HEAD"
        content_dict = hit_endpoint(code_json_endpoint, self.gitlab_token)
    except Exception as e:
        # Report the cause instead of discarding it.
        print(f"Problem querying the Gitlab API: {e}")
        return None

    try:
        # The file body is read from the base64-encoded 'content' field.
        decoded_content = base64.b64decode(content_dict['content'])
        return json.loads(decoded_content)
    except (KeyError, TypeError) as e:
        # Response had no 'content' field or was not a mapping at all
        # (for example an error payload or None).
        print(f"Unexpected Gitlab API response: {e!r}")
        return None
    except ValueError as e:
        # ValueError covers binascii.Error, UnicodeDecodeError and
        # json.JSONDecodeError.
        print(f"JSON Error {e}")
        return None
143+
144+
def get_code_json_bitbucket(self, repo: str) -> Optional[Dict]:
    """Fetch and parse ``code.json`` from a bitbucket.org repository.

    Args:
        repo: HTTP(S) URL of the Bitbucket repository.

    Returns:
        The parsed ``code.json`` content, or ``None`` when the request
        failed, returned an HTTP error status, or the body is not valid JSON.
    """
    try:
        owner, name = get_repo_owner_and_name(repo)
        code_json_endpoint = f"https://bitbucket.org/{owner}/{name}/raw/HEAD/code.json"

        with requests.Session() as session:
            # Only attach basic auth when both credentials are configured;
            # otherwise the literal string "None:None" would be sent.
            if self.bitbucket_user and self.bitbucket_password:
                session.auth = (self.bitbucket_user, self.bitbucket_password)

            # This request previously went to http://, which put the
            # Authorization header on the wire in plaintext.
            # NOTE(review): the response was never used; this priming
            # request looks unnecessary -- confirm and remove.
            session.post('https://bitbucket.org')

            response = session.get(code_json_endpoint)
            # Treat 4xx/5xx as "no code.json" instead of parsing an error page.
            response.raise_for_status()
    except Exception as e:
        print(f"Exception when querying bitbucket.org: {e}")
        # Without this return the code fell through to an unbound variable.
        return None

    try:
        return json.loads(response.text)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"JSON Error: {e}")
        return None
121157

122158
def get_code_json(self, repo: str) -> Optional[Dict]:
    """Route a repository URL to the fetcher for its hosting service.

    The URL is checked for the substrings 'github', 'gitlab' and
    'bitbucket', in that order; the first one found selects the fetcher.

    Args:
        repo: HTTP(S) URL of the repository.

    Returns:
        Whatever the selected fetcher returns, or ``None`` when the URL
        names none of the supported services.
    """
    for service in ('github', 'gitlab', 'bitbucket'):
        if service in repo:
            # Look the fetcher up only once a service has matched.
            fetch = getattr(self, 'get_code_json_' + service)
            return fetch(repo)
    return None
127167

128168
def save_code_json(self, repo: str, output_path: str) -> Optional[str]:
129169

@@ -147,7 +187,7 @@ def update_index(self, index: Dict, code_json: Dict, org_name: str, repo_name: s
147187

148188
index['releases'].append(baseline)
149189

150-
def get_org_repos(self, org_name: str) -> list[Dict]:
190+
def get_github_org_repos(self, org_name: str) -> list[Dict]:
151191
try:
152192
org_endpoint = f"https://api.github.com/orgs/{org_name}/repos"
153193
print(f"\nProcessing organization: {org_name}")
@@ -162,34 +202,64 @@ def get_org_repos(self, org_name: str) -> list[Dict]:
162202
except Exception as e:
163203
raise e
164204

165-
def save_organization_files(self, org_name: str, codeJSONPath) -> None:
166-
raise NotImplementedError
205+
def _enumerate_repo_orgs(self, org_name, repo_name, url, total_repos, codeJSONPath=None, add_to_index=True, repo_number=None):
    """Fetch one repository's code.json and optionally add it to the index.

    Args:
        org_name: Organization/group the repository belongs to.
        repo_name: Repository name, used for log output and the saved file name.
        url: Repository URL handed to ``get_code_json`` / ``save_code_json``.
        total_repos: Number of repositories being processed (progress output).
        codeJSONPath: Optional directory; when given, the file is written to
            ``<codeJSONPath>/<repo_name>.json`` via ``save_code_json``.
        add_to_index: When true, a found code.json is appended to ``self.index``.
        repo_number: 1-based position of this repository, for progress output.
    """
    # The progress line previously printed the builtin ``id`` function
    # because the loop counter was never passed in.
    position = repo_number if repo_number is not None else "?"
    print(f"\nChecking {repo_name} [{position}/{total_repos}]")

    if not codeJSONPath:
        code_json = self.get_code_json(url)
    else:
        repoPath = os.path.join(codeJSONPath, (repo_name + '.json'))
        # NOTE(review): save_code_json is annotated -> Optional[str] while
        # update_index expects a Dict -- confirm what it actually returns.
        code_json = self.save_code_json(url, repoPath)

    # ``add_to_index`` was previously an undefined name here (NameError).
    if code_json and add_to_index:
        print(f"✅ Found code.json in {repo_name}")
        self.update_index(self.index, code_json, org_name, repo_name)
    elif not code_json:
        print(f"❌ No code.json found in {repo_name}")


def process_github_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None:
    """Process every repository of a GitHub organization.

    Args:
        org_name: GitHub organization name.
        add_to_index: Whether found code.json files are added to the index.
        codeJSONPath: Optional directory in which to save each code.json.

    Any exception is reported on stdout and swallowed so one failing
    organization does not abort the whole run.
    """
    try:
        repos = self.get_github_org_repos(org_name)
        total_repos = len(repos)

        for repo_number, repo in enumerate(repos, 1):
            self._enumerate_repo_orgs(
                org_name,
                repo['name'],
                repo['svn_url'],
                total_repos,
                codeJSONPath=codeJSONPath,
                add_to_index=add_to_index,
                repo_number=repo_number,
            )

    except Exception as e:
        print(f"Error processing organization {org_name}: {str(e)}")


def get_gitlab_org_repos(self, org_name: str) -> list[Dict]:
    """List the projects of a gitlab.com group.

    Args:
        org_name: Group path; nested groups use '/' separators, which are
            percent-encoded for the API path.

    Returns:
        The project list returned by the API, or an empty list when the
        query failed.
    """
    try:
        url_encoded_org_name = org_name.replace("/", "%2F")
        org_endpoint = f"https://gitlab.com/api/v4/groups/{url_encoded_org_name}/projects"

        # NOTE(review): this presumably returns only the first page of
        # results -- confirm whether hit_endpoint follows pagination.
        repo_list = hit_endpoint(org_endpoint, self.gitlab_token)

        total_repos = len(repo_list)
        print(f"Found {total_repos} public repositories")

        # Return the projects themselves; returning the count made the
        # caller's len()/iteration fail.
        return repo_list
    except Exception as e:
        print(f"Ran into Exception when querying Gitlab Repos in group {org_name}: {e}")
        # An empty list honours the annotation and is safe to iterate.
        return []


def process_gitlab_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None:
    """Process every project of a GitLab group.

    Args:
        org_name: GitLab group path.
        add_to_index: Whether found code.json files are added to the index.
        codeJSONPath: Optional directory in which to save each code.json.

    Any exception is reported on stdout and swallowed so one failing
    group does not abort the whole run.
    """
    try:
        repos = self.get_gitlab_org_repos(org_name)
        total_repos = len(repos)

        for repo_number, repo in enumerate(repos, 1):
            self._enumerate_repo_orgs(
                org_name,
                repo['name'],
                repo['web_url'],
                total_repos,
                codeJSONPath=codeJSONPath,
                add_to_index=add_to_index,
                repo_number=repo_number,
            )

    except Exception as e:
        print(f"Error processing organization {org_name}: {str(e)}")
190260

191261
def save_index(self, output_path: str) -> None:
192-
# sorts index by organizaiton then by name
262+
# sorts index by organization then by name
193263
self.index['releases'].sort(key=lambda x: (x.get('organization', ''), x.get('name', '')))
194264

195265
with open(output_path, 'w') as f:

main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def main():
4646

4747
for org in args.orgs.split(","):
4848
org = org.strip()
49-
indexGen.process_organization(org)
49+
indexGen.process_github_org_files(org)
5050

5151
indexGen.save_index(args.output)
5252
print(f"\nIndexing complete. Results saved to {args.output}")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ repository = "https://github.com/DSACMS/codejson-index-generator"
1010
[tool.poetry.dependencies]
1111
python = "^3.13"
1212
requests = "^2.32.4"
13+
llnl-scraper = "^0.15.0"
1314

1415

1516
[build-system]

0 commit comments

Comments
 (0)