
Commit 48dc4fa
indeed+so job scrapper (old)
1 parent 76b8f29

File tree: 3 files changed, +64 −8 lines (day2_5.py, indeed.py, stack_over_flow.py)


day2_5.py (7 additions, 3 deletions)

```diff
@@ -1,9 +1,13 @@
-from indeed import extract_indeed_jobs, extract_indeed_pages
+from indeed import get_jobs as get_indeed_jobs
+from stack_over_flow import get_jobs as get_so_jobs
 
 # range() - builds a sequence as large as the given number
 # for n in range(max_page):
 #     print(f"start={n*50}")
 
-last_indeed_pages = extract_indeed_pages()
+# last_indeed_pages = extract_indeed_pages()
 
-extract_indeed_jobs(last_indeed_pages)
+# indeed_jobs = extract_indeed_jobs(last_indeed_pages)
+
+indeed_jobs = get_indeed_jobs()
+so_jobs = get_so_jobs()
```
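Aliasing the two same-named `get_jobs` functions at import time (`as get_indeed_jobs` / `as get_so_jobs`) is what lets both scrapers expose an identical interface in one namespace. A minimal sketch of where the entry script is headed (not part of the commit; it assumes both functions return lists of job dicts, which holds for indeed.py while stack_over_flow.py is still a stub returning an empty list):

```python
# Hypothetical extension of day2_5.py: merge the two scrapers' output.
from indeed import get_jobs as get_indeed_jobs
from stack_over_flow import get_jobs as get_so_jobs

def get_all_jobs():
    # indeed.py returns dicts with title/company/location/link;
    # stack_over_flow.py still returns [] at this commit.
    jobs = get_indeed_jobs() + get_so_jobs()
    print(f"collected {len(jobs)} jobs")
    return jobs

if __name__ == "__main__":
    for job in get_all_jobs():
        print(job["title"], "/", job["link"])
```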

indeed.py (42 additions, 5 deletions)

```diff
@@ -4,7 +4,7 @@
 LIMIT = 50
 URL = f"https://www.indeed.com/jobs?q=Python&limit={LIMIT}"
 
-def extract_indeed_pages():
+def get_last_page():
     result = requests.get(URL)
 
     soup = BeautifulSoup(result.text, "html.parser")
@@ -17,11 +17,48 @@ def extract_indeed_pages():
 
     for link in links[:-1]:  # skip the last element
         pages.append(int(link.string))  # convert string -> integer
-
     max_page = pages[-1]
     return max_page
 
-def extract_indeed_jobs(last_page):
+def extract_job(html):
+    title = html.find("div", {"class": "title"}).find("a")["title"]
+    company = html.find("span", {"class": "company"})
+
+    if company:
+        company_anchor = company.find("a")
+        if company_anchor is not None:
+            company = str(company_anchor.string)
+        else:
+            company = str(company.string)
+        company = company.strip()
+    else:
+        company = None
+
+    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]  # read the attribute off the div
+    job_id = html["data-jk"]
+
+    return {
+        'title': title,
+        'company': company,
+        'location': location,
+        "link": f"https://www.indeed.com/viewjob?jk={job_id}"
+    }
+
+def extract_jobs(last_page):
+    jobs = []
     for page in range(last_page):
-        res = requests.get(f"{URL}&start={page*LIMIT}")
-        print(res.status_code)
+        res = requests.get(f"{URL}&start={last_page*LIMIT}")
+        soup = BeautifulSoup(res.text, "html.parser")
+        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
+
+        # print(results)
+        for res in results:
+            job = extract_job(res)
+            jobs.append(job)
+    return jobs
+
+
+def get_jobs():
+    last_page = get_last_page()
+    jobs = extract_jobs(last_page)
+    return jobs
```
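Note a likely regression in the new `extract_jobs()`: the request uses `start={last_page*LIMIT}`, so every pass of the loop fetches the same final offset instead of advancing with `page*LIMIT` as the deleted line did, and the inner `for res in results:` reuses the name of the response object. A corrected sketch of the loop under that reading (not what the commit contains; it borrows `extract_job` from the diff above):

```python
# Corrected pagination sketch for extract_jobs(); the committed version
# requests the same offset on every iteration and shadows `res`.
import requests
from bs4 import BeautifulSoup
from indeed import extract_job  # the helper added in this commit

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=Python&limit={LIMIT}"

def extract_jobs_fixed(last_page):
    jobs = []
    for page in range(last_page):
        # advance the offset with the loop variable, not last_page
        response = requests.get(f"{URL}&start={page * LIMIT}")
        soup = BeautifulSoup(response.text, "html.parser")
        for card in soup.find_all("div", {"class": "jobsearch-SerpJobCard"}):
            jobs.append(extract_job(card))
    return jobs
```

Giving the response and each result card distinct names (`response`, `card`) keeps the two scopes from colliding.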

stack_over_flow.py (15 additions, 0 deletions)

```diff
@@ -0,0 +1,15 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+URL = f"https://stackoverflow.com/jobs?q=python&pg=2"
+
+def get_last_page():
+    result = requests.get(URL)
+    soup = BeautifulSoup(result.text, "html.parser")
+    pages = soup.find("div", {"class": "pagination"}).find_all("a")
+
+
+def get_jobs():
+    last_page = get_last_page()
+    return []
```
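At this commit `get_last_page()` stops after collecting the pagination anchors and implicitly returns None, so `get_jobs()` cannot paginate yet (and the hardcoded `pg=2` plus the placeholder-free f-string suggest the URL is still being probed). A hedged sketch of one way to finish it, mirroring the indeed.py pattern; the pagination markup and the position of the last page link are assumptions about stackoverflow.com, not something the commit confirms:

```python
# Hypothetical completion of get_last_page(); markup details are assumed.
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python"

def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "pagination"}).find_all("a")
    # Assumption: the final anchor is a "next" arrow, so the second-to-last
    # anchor holds the highest page number (the same trick indeed.py plays
    # with links[:-1]).
    return int(pages[-2].get_text(strip=True))

def get_jobs():
    last_page = get_last_page()
    print(f"Stack Overflow has {last_page} pages of results")
    return []  # job extraction still to come
```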
