-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgithub_scraper.py
158 lines (130 loc) · 5.9 KB
/
github_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Browser-automation setup: WebSession drives a real Chrome instance.
import json
import time

import requests

from web_actions import SelectorType, WebSession

# Chrome launch options: visible (non-headless) window at a fixed size, with a
# persistent profile under ./userdata so any GitHub login state is reused
# between runs.
options = {
    "headless": False,
    "incognito": False,
    "disable-gpu": False,
    "window-size": "1920,1080",
    "user-data-dir": "userdata",
}

# Launch the browser session with the options above.
session = WebSession(options=options)
# Selector catalogue for the GitHub profile page. Plain-suffix keys are
# XPath expressions or raw class names; "*2" keys are fallback variants
# (space-separated class lists that get converted to CSS selectors below).
selectors = {
    # Profile display name: XPath primary, class-list fallback.
    "name": '/html/body/div[1]/div[4]/main/div[2]/div/div[1]/div/div[2]/div[1]/div[2]/h1/span[1]',
    "name2": 'p-name vcard-fullname d-block overflow-hidden',
    # Profile @handle: XPath primary, class-list fallback.
    "username": '/html/body/div[1]/div[4]/main/div[2]/div/div[1]/div/div[2]/div[1]/div[2]/h1/span[2]',
    "username2": 'p-nickname vcard-username d-block',
    # Short bio text under the avatar.
    "bio": '/html/body/div[1]/div[4]/main/div[2]/div/div[1]/div/div[2]/div[3]/div[2]/div[1]',
    # Repository-count badge on the Repositories tab.
    "repositories": '//*[@id="js-pjax-container"]/div[2]/div[1]/div[2]/div[1]/div[1]/div/h2/a/span',
    # Per-repository card fields.
    "repo_name": "repo",
    "repo_description": 'pinned-item-desc color-fg-muted text-small mt-2',
    "repo_language": 'd-inline-block mr-3',
    "repo_stars": 'pinned-item-meta Link--muted',
    "repo_forks": '//a[contains(@href,"/fork")]',
    # Repository card containers: class-list primary, XPath fallback.
    "repo_container": "mb-3 d-flex flex-content-stretch col-12 col-md-6 col-lg-6",
    "repo_container2": '//*[@id="user-profile-frame"]/div/div[2]/div/ol/li[1]',
    # Status labels (e.g. "Public template" / archived) in either style.
    "repo_status": 'Label Label--attention v-align-middle mt-1 no-wrap v-align-baseline Label--inline',
    "repo_status2": 'Label Label--secondary v-align-middle mt-1 no-wrap v-align-baseline Label--inline',
}
# Open the GitHub profile page in the browser session.
profile_url = "https://github.com/Silenttttttt"
session.go_to(profile_url)

# The last path segment of the profile URL is the GitHub username.
username = profile_url.split('/')[-1]

# GitHub renders the profile README from the special <username>/<username>
# repository; its default branch may be "main" or "master", so try both and
# keep the first one that exists.
readme_content = ""
branches = ["main", "master"]
for branch in branches:
    readme_url = f"https://raw.githubusercontent.com/{username}/{username}/refs/heads/{branch}/README.md"
    # timeout prevents the script from hanging indefinitely on a stalled
    # connection (requests has no default timeout).
    response = requests.get(readme_url, timeout=10)
    if response.status_code == 200:
        readme_content = response.text
        break

# Persist the README locally; the file is created empty if neither branch
# had a profile README.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)
# The space-separated class lists in `selectors` must be converted to CSS
# selector syntax before they can be used with SelectorType.CSS lookups.
to_css = session.class_to_css_selector

name2_selector = to_css(selectors["name2"])
username2_selector = to_css(selectors["username2"])
repo_description_selector = to_css(selectors["repo_description"])
repo_selector = to_css(selectors["repo_container"])
repo_language_selector = to_css(selectors["repo_language"])
repo_stars_selector = to_css(selectors["repo_stars"])
repo_status_selector = to_css(selectors["repo_status"])
repo_status2_selector = to_css(selectors["repo_status2"])
# Locate the repository cards on the profile. Try the class-based CSS
# selector first; if nothing is found (or the lookup raises), fall back to
# the XPath variant used by the alternate profile layout.
try:
    repo_elements = session.find_elements(SelectorType.CSS, repo_selector, timeout=5, raise_exc=True)
    if not repo_elements:
        raise Exception("No repository elements found")
except Exception:
    repo_elements = session.find_elements(SelectorType.XPATH, selectors["repo_container2"], timeout=5, raise_exc=True)
# Extract the profile display name: XPath selector first, CSS class
# fallback second, empty string if both fail.
try:
    name = session.extract(selector_type=SelectorType.XPATH, selector=selectors["name"], timeout=5, raise_exc=True)
except Exception:
    try:
        name = session.extract(selector_type=SelectorType.CSS, selector=name2_selector, timeout=5, raise_exc=True)
    except Exception:
        name = ""

# Extract the @handle with the same two-step fallback. NOTE: this overwrites
# the `username` derived from the URL above with the on-page value.
try:
    username = session.extract(selector_type=SelectorType.XPATH, selector=selectors["username"], timeout=5, raise_exc=True)
except Exception:
    try:
        username = session.extract(selector_type=SelectorType.CSS, selector=username2_selector, timeout=5, raise_exc=True)
    except Exception:
        username = ""

# The bio has no secondary selector; default to empty on failure.
try:
    bio = session.extract(selector_type=SelectorType.XPATH, selector=selectors["bio"], timeout=5, raise_exc=True)
except Exception:
    bio = ""
def _first_match(element, attempts):
    """Return the first successful extraction from (selector_type, selector)
    attempts within `element`, or "" if every attempt raises."""
    for selector_type, selector in attempts:
        try:
            return session.extract(element=element, selector_type=selector_type,
                                   selector=selector, skip_wait=True, raise_exc=True)
        except Exception:
            continue
    return ""


# Extract and organize the per-repository information from each card.
repositories = []
for element in repo_elements:
    # Name and description use the session's default (non-raising) behavior.
    repo_name = session.extract(element=element, selector_type=SelectorType.CLASS_NAME, selector=selectors["repo_name"], skip_wait=True)
    repo_description = session.extract(element=element, selector_type=SelectorType.CSS, selector=repo_description_selector, skip_wait=True)
    # Optional fields default to "" when absent from the card.
    repo_language = _first_match(element, [(SelectorType.CSS, repo_language_selector)])
    repo_stars = _first_match(element, [(SelectorType.CSS, repo_stars_selector)])
    repo_forks = _first_match(element, [(SelectorType.XPATH, selectors["repo_forks"])])
    # Status labels come in two visual styles; try both.
    repo_status = _first_match(element, [(SelectorType.CSS, repo_status_selector),
                                         (SelectorType.CSS, repo_status2_selector)])
    repositories.append({
        "name": repo_name,
        "description": repo_description,
        "language": repo_language,
        "stars": repo_stars,
        "forks": repo_forks,
        "status": repo_status,
    })
# Assemble the scraped profile into one JSON-serializable payload.
# `username` is included so the extracted handle is not discarded.
data = {
    "profile_name": name,
    "username": username,
    "bio": bio,
    "repositories": repositories,
}

# Save the result next to the script.
with open('github_profile.json', 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4)

# Uncomment the following line to enter debug mode
# session.debug()

# Close the browser session
session.close()