Commit 281e537

async webscraping
1 parent 9b83506 commit 281e537

File tree

10 files changed: +8319 −0 lines changed


async_webscraping/Pipfile

+13
@@ -0,0 +1,13 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
aiohttp = "*"

[requires]
python_version = "3.8"
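
pipenv resolves this file into a virtualenv; a throwaway check like the sketch below (not part of the commit) can confirm the environment matches the pins above. The version assertion mirrors the python_version pin, nothing more.

import sys

import aiohttp  # pulled in by `pipenv install` from the [packages] section

# the Pipfile pins python_version = "3.8"
assert sys.version_info[:2] == (3, 8), "expected the pinned Python 3.8"
print(f"aiohttp {aiohttp.__version__} on Python {sys.version.split()[0]}")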

async_webscraping/ascrape.py

+18
@@ -0,0 +1,18 @@
import asyncio
import pathlib

from aiohttp import ClientSession


async def main():
    url = 'https://www.boxofficemojo.com/year/2019'
    async with ClientSession() as session:
        async with session.get(url) as response:
            html_body = await response.read()
            return html_body

html_data = asyncio.run(main())
OUTPUT_DIR = pathlib.Path().resolve() / "snapshots"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = OUTPUT_DIR / "2019.html"
OUTPUT_FILE.write_text(html_data.decode())
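
A minimal follow-up sketch (not part of the commit) for inspecting the saved snapshot with only the standard library. A real scraper would likely reach for a dedicated HTML parser, but this is enough to confirm the download contains parseable markup; the path assumes you run it from the same directory as ascrape.py.

import pathlib
from html.parser import HTMLParser


class LinkCounter(HTMLParser):
    """Counts anchor tags as a cheap sanity check on the snapshot."""

    def __init__(self):
        super().__init__()
        self.links = 0

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links += 1


snapshot = pathlib.Path("snapshots/2019.html").read_text()
counter = LinkCounter()
counter.feed(snapshot)
print(f"{counter.links} links in the 2019 snapshot")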

async_webscraping/ascrape_multi.py

+43
@@ -0,0 +1,43 @@
import asyncio
import pathlib

from aiohttp import ClientSession


async def fetch(url, session, year=None):
    async with session.get(url) as response:
        html_body = await response.read()
        return {"body": html_body, "year": year}


async def fetch_with_sem(url, session, year, sem):
    async with sem:
        return await fetch(url, session, year)


async def main(start_year=2020, years_ago=5):
    pages_content = {}
    tasks = []
    # semaphore limits concurrency to 10 simultaneous requests
    sem = asyncio.Semaphore(10)
    async with ClientSession() as session:
        for i in range(0, years_ago):
            year = start_year - i
            url = f'https://www.boxofficemojo.com/year/{year}/'
            print(year, url)
            tasks.append(
                asyncio.create_task(fetch_with_sem(url, session, year, sem))
            )
        pages_content = await asyncio.gather(*tasks)
    return pages_content


results = asyncio.run(main())
# print(results)
OUTPUT_DIR = pathlib.Path().resolve() / "snapshots"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for result in results:
    current_year = result.get("year")
    html_data = result.get("body")
    OUTPUT_FILE = OUTPUT_DIR / f"{current_year}.html"
    OUTPUT_FILE.write_text(html_data.decode())
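
One hedged extension (not in the commit): the same fetch with a client timeout and simple retries, so a single slow or failing year cannot stall gather() indefinitely. ClientTimeout, ClientError, and raise_for_status are real aiohttp names; fetch_safe, the retry count, and the backoff are illustrative choices.

import asyncio

from aiohttp import ClientError, ClientSession, ClientTimeout


async def fetch_safe(url, session, year=None, retries=3):
    for attempt in range(retries):
        try:
            async with session.get(url) as response:
                response.raise_for_status()
                return {"body": await response.read(), "year": year}
        except (ClientError, asyncio.TimeoutError):
            # back off briefly before retrying
            await asyncio.sleep(2 ** attempt)
    return {"body": b"", "year": year}


async def main():
    timeout = ClientTimeout(total=30)  # cap each request at 30 seconds
    async with ClientSession(timeout=timeout) as session:
        return await fetch_safe(
            "https://www.boxofficemojo.com/year/2019/", session, 2019
        )


print(asyncio.run(main())["year"])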

async_webscraping/async.py

+37
@@ -0,0 +1,37 @@
import asyncio
import time

iteration_times = [1, 3, 2, 4]


async def sleeper(seconds, i=-1):
    start_time = time.time()
    if i != -1:
        print(f"{i}\t{seconds}s")
    await asyncio.sleep(seconds)
    return time.time() - start_time

run_time = 0
total_compute_run_time = 0


async def main():  # coroutine
    global run_time
    global total_compute_run_time
    # await sleeper(1, i=0)
    tasks = []
    for i, second in enumerate(iteration_times):
        tasks.append(
            asyncio.create_task(
                sleeper(second, i=i)
            )
        )
    results = await asyncio.gather(*tasks)
    for run_time_result in results:
        total_compute_run_time += run_time_result
        if run_time_result > run_time:
            run_time = run_time_result

# main()
asyncio.run(main())
print(f"Ran for {run_time:.2f}s of wall time against {total_compute_run_time:.2f}s of total compute time ({run_time / total_compute_run_time:.2%} of the summed task time)")
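
Since sleeper() only awaits, the wall time reported above should land near max(iteration_times) = 4s while the summed per-task times approach sum(iteration_times) = 10s. A standalone sketch (not part of the commit) that measures the same effect from the outside:

import asyncio
import time

iteration_times = [1, 3, 2, 4]


async def timed_gather():
    # run all the sleeps concurrently and time the whole batch
    start = time.perf_counter()
    await asyncio.gather(*(asyncio.sleep(s) for s in iteration_times))
    return time.perf_counter() - start


wall = asyncio.run(timed_gather())
print(f"wall {wall:.2f}s vs serial {sum(iteration_times)}s")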

async_webscraping/snapshots/2016.html

+1,728
Large diffs are not rendered by default.

async_webscraping/snapshots/2017.html

+1,724
Large diffs are not rendered by default.

async_webscraping/snapshots/2018.html

+1,998
Large diffs are not rendered by default.

async_webscraping/snapshots/2019.html

+1,828
Large diffs are not rendered by default.

async_webscraping/snapshots/2020.html

+906
Large diffs are not rendered by default.

async_webscraping/sync.py

+24
@@ -0,0 +1,24 @@
import time

iteration_times = [1, 3, 2, 4]


def sleeper(seconds, i=-1):
    start_time = time.time()
    if i != -1:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)
    return time.time() - start_time


run_time = 0


def main():
    global run_time
    for i, second in enumerate(iteration_times):
        run_time += sleeper(second, i=i)


main()
print(f"Ran for {run_time} seconds")
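
For contrast (not part of the commit), the same blocking workload on a thread pool recovers most of the concurrency without rewriting sleeper() as a coroutine, since time.sleep releases the GIL while blocked:

import time
from concurrent.futures import ThreadPoolExecutor

iteration_times = [1, 3, 2, 4]


def sleeper(seconds):
    start_time = time.time()
    time.sleep(seconds)
    return time.time() - start_time


start = time.time()
# one worker per sleep, so all four run at once
with ThreadPoolExecutor(max_workers=len(iteration_times)) as pool:
    total_compute = sum(pool.map(sleeper, iteration_times))
print(f"wall {time.time() - start:.2f}s, compute {total_compute:.2f}s")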

0 commit comments
