Commit cd5f666

automated_login_and_browsing, scrape stackoverflow
1 parent: c562f28

7 files changed: +3398 -0 lines changed
`automated_login_and_browsing/1_google_bot.py` (36 additions, 0 deletions)

```python
import os
import time

from selenium import webdriver

BASE_PATH = os.path.dirname(__file__)
driver_path = os.path.join(BASE_PATH, 'geckodriver')


# Firefox browser
browser = webdriver.Firefox(executable_path=driver_path)
# browser.get("http://dscrecbijnor.com")


# Chrome browser
# chromium_driver_path = os.path.join(BASE_PATH, "<path>")
# browser = webdriver.Chrome(executable_path=chromium_driver_path)
# browser.get("http://dscrecbijnor.com")


# Automated Google search
url = "https://google.com/"
browser.get(url)
time.sleep(2)
name = 'q'
search_el = browser.find_element_by_name(name)

# print(search_el)
search_el.send_keys("selenium python")
submit_btn_el = browser.find_element_by_css_selector("input[type=submit]")
print(submit_btn_el.get_attribute('name'))
time.sleep(1)
submit_btn_el.click()

# now scrape the content using web scraping
```
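The final comment leaves the scraping step open. A minimal sketch of one way to continue, appended to the end of the script above (so `browser` and `time` are already available) and using the same Selenium 3 style API; the `h3` selector is an assumption about Google's results markup, not part of this commit:

```python
time.sleep(2)  # wait for the results page to load

# Google typically renders result titles as <h3> elements (assumption)
for title_el in browser.find_elements_by_css_selector("h3"):
    if title_el.text:
        print(title_el.text)
```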
`automated_login_and_browsing/2_instaBot.py` (191 additions, 0 deletions)

```python
# Import libraries
import os
import time
from getpass import getpass
from urllib.parse import urlparse

import requests
from selenium import webdriver

# Set paths for files
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
driver_path = os.path.join(BASE_PATH, 'geckodriver')

# Credentials
username = 'its_yours_kumar'
my_pass = getpass("What is your password?")


# Firefox browser
browser = webdriver.Firefox(executable_path=driver_path)
url = "https://instagram.com/"
browser.get(url)

# Login
time.sleep(2)
username_el = browser.find_element_by_name("username")
username_el.send_keys(username)

password_el = browser.find_element_by_name("password")
password_el.send_keys(my_pass)

time.sleep(1.5)
submit_btn_el = browser.find_element_by_css_selector("button[type='submit']")
submit_btn_el.click()


body_el = browser.find_element_by_css_selector("body")
html_text = body_el.get_attribute("innerHTML")
# print(html_text)
# use web scraping here to scrape the html


# Automatic follow
# follow = browser.find_element_by_css_selector("button")

# XPath
# my_button_xpath = "//button"
# browser.find_element_by_xpath(my_button_xpath)


def click_to_follow(browser):
    # "Follow" buttons only, skipping ones that already read "Following"
    my_follow_btn_xpath = "//button[contains(text(), 'Follow')][not(contains(text(), 'Following'))]"
    follow_btn_elements = browser.find_elements_by_xpath(my_follow_btn_xpath)
    for btn in follow_btn_elements:
        time.sleep(2)
        try:
            btn.click()
        except Exception:
            pass


new_user = "https://instagram.com/ted/"
browser.get(new_user)
click_to_follow(browser)


# Scraping content from any post
time.sleep(50)
user_profile_url = "https://www.instagram.com/dscrecbijnor/"
browser.get(user_profile_url)

post_url_pattern = "https://www.instagram.com/p/<post-slug-id>"  # reference only, unused
post_xpath_str = "//a[contains(@href, '/p/')]"
post_links = browser.find_elements_by_xpath(post_xpath_str)
post_link_el = None

if len(post_links) > 0:
    post_link_el = post_links[0]

if post_link_el is not None:
    post_href = post_link_el.get_attribute("href")
    browser.get(post_href)

video_els = browser.find_elements_by_xpath("//video")
image_els = browser.find_elements_by_xpath("//img")

"""
Earlier one-off version of the image downloader, kept commented out:

img_dir = os.path.join(BASE_PATH, "images")
os.makedirs(img_dir, exist_ok=True)
for img in image_els:
    # print(img.get_attribute('src'))
    url = img.get_attribute('src')
    base_url = urlparse(url).path
    filename = os.path.basename(base_url)
    filepath = os.path.join(img_dir, filename)
    with requests.get(url, stream=True) as r:
        try:
            r.raise_for_status()
        except Exception:
            continue
        with open(filepath, "wb") as f:
            for chunk in r.iter_content():
                if chunk:
                    f.write(chunk)
"""


def scrape_and_save(elements):
    data_dir = os.path.join(BASE_PATH, "data")
    os.makedirs(data_dir, exist_ok=True)
    for el in elements:
        url = el.get_attribute('src')
        base_url = urlparse(url).path
        filename = os.path.basename(base_url)
        filepath = os.path.join(data_dir, filename)
        if os.path.exists(filepath):
            continue
        with requests.get(url, stream=True) as r:
            try:
                r.raise_for_status()
            except Exception:
                continue
            with open(filepath, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)


scrape_and_save(image_els)
# scrape_and_save(video_els)


# Automatic like and comments on posts
"""
LONG TERM GOAL
Use machine learning to classify the post's
images or videos
and then comment in a relevant fashion
"""

# Automate comment


def automated_comment(browser, content="that's cool!"):
    time.sleep(3)
    comment_xpath_str = "//textarea[contains(@placeholder, 'Add a comment')]"
    comment_el = browser.find_element_by_xpath(comment_xpath_str)
    # print(comment_el)
    comment_el.send_keys(content)
    submit_btns = "button[type='submit']"
    submit_btn_els = browser.find_elements_by_css_selector(submit_btns)
    time.sleep(2)
    for btn in submit_btn_els:
        try:
            btn.click()
        except Exception:
            pass


automated_comment(browser)

# Automate like
"""
The like button is actually not a <button>; it's an SVG.
Click its parent element instead.
"""


def automated_like(browser):
    like_heart_svg_xpath = "//*[contains(@aria-label, 'Like')]"
    all_like_heart_els = browser.find_elements_by_xpath(like_heart_svg_xpath)

    # The tallest "Like" heart is assumed to belong to the post itself
    max_heart_h = -1
    for heart_el in all_like_heart_els:
        h = heart_el.get_attribute("height")
        if h is not None:
            max_heart_h = max(max_heart_h, int(h))

    all_like_heart_els = browser.find_elements_by_xpath(like_heart_svg_xpath)
    for heart_el in all_like_heart_els:
        h = heart_el.get_attribute("height")
        # print(h)
        if h == f"{max_heart_h}":
            # the SVG itself isn't clickable; click its parent
            parent_button = heart_el.find_element_by_xpath('..')
            time.sleep(2)
            try:
                parent_button.click()
            except Exception:
                pass


automated_like(browser)
```
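Earlier in the file, `html_text` is captured with only a placeholder comment for the scraping step. A minimal sketch of that step, assuming the standard-library `re` module; the regex is an assumption about Instagram's markup, though the `/p/` pattern mirrors the XPath used later in the file:

```python
import re

# Pull candidate post paths out of the captured innerHTML
post_paths = re.findall(r'href="(/p/[^"]+)"', html_text)
print(post_paths[:10])
```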
`README.md` (39 additions, 0 deletions)

# automated_login_and_browsing

This project is about automating login to an account and browsing the internet: searching for content, scraping it, and saving it to the local machine.


## Automated Google search and scrape

The file [1_google_bot.py](automated_login_and_browsing/1_google_bot.py) contains the code for an automated Google web search.

## Instagram Bot

This is an [Instagram bot](automated_login_and_browsing/2_instaBot.py) which automatically logs in to your Instagram account, opens a user's profile, and downloads their posts. It also provides automated follow, automated like, and automated comments.

### Setup

1. Install the requirements from the `requirements.txt` file (*selenium*, *requests*): `pip install -r requirements.txt`. The code uses the Selenium 3 style API (`executable_path`, `find_element_by_*`), so a Selenium 3 release is assumed.
2. After installing the requirements, download the webdriver for your browser:

   | Browser | Supported OS | Maintained by | Download | Issue Tracker |
   | --- | --- | --- | --- | --- |
   | Chromium/Chrome | Windows/macOS/Linux | Google | [Downloads](https://chromedriver.storage.googleapis.com/index.html) | [Issues](https://bugs.chromium.org/p/chromedriver/issues/list) |
   | Firefox | Windows/macOS/Linux | Mozilla | [Downloads](https://github.com/mozilla/geckodriver/releases) | [Issues](https://github.com/mozilla/geckodriver/issues) |
   | Edge | Windows 10 | Microsoft | [Downloads](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/) | [Issues](https://developer.microsoft.com/en-us/microsoft-edge/platform/issues/?page=1&q=webdriver) |
   | Internet Explorer | Windows | Selenium Project | [Downloads](https://selenium-release.storage.googleapis.com/index.html) | [Issues](https://github.com/SeleniumHQ/selenium/labels/D-IE) |
   | Safari | macOS El Capitan and newer | Apple | Built in | [Issues](https://bugreport.apple.com/logon) |
   | Opera | Windows/macOS/Linux | Opera | [Downloads](https://github.com/operasoftware/operachromiumdriver/releases) | [Issues](https://github.com/operasoftware/operachromiumdriver/issues) |

3. Now import `webdriver` from the selenium package and enjoy:
```python
import os

from selenium import webdriver

BASE_PATH = os.path.dirname(os.path.abspath(__file__))

# if Firefox browser
driver_path = os.path.join(BASE_PATH, 'geckodriver')
browser = webdriver.Firefox(executable_path=driver_path)

# if Chrome browser
# chromium_driver_path = os.path.join(BASE_PATH, "<path>")
# browser = webdriver.Chrome(executable_path=chromium_driver_path)

url = "https://google.com/"
browser.get(url)
```
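From there, the automated search in [1_google_bot.py](automated_login_and_browsing/1_google_bot.py) takes only a few more calls. A minimal sketch, appended to the snippet above and assuming Google's search box is still the input named `q`:

```python
import time

time.sleep(2)  # give the page a moment to render

search_el = browser.find_element_by_name('q')  # Google's search box
search_el.send_keys("selenium python")

time.sleep(1)
submit_btn_el = browser.find_element_by_css_selector("input[type=submit]")
submit_btn_el.click()
```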
### Author

[Kumar Shanu](https://its-kumar.herokuapp.com/)
`requirements.txt` (2 additions, 0 deletions)

```
selenium
requests
```
