-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
84 lines (70 loc) · 3.5 KB
/
main.py
File metadata and controls
84 lines (70 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
# https://www.westermann.de/backend/buchlink/aufrufen/978-3-14-109677-4/1000324819
# https://www.westermann.de/backend/buchlink/aufrufen/978-3-14-109677-4/1000324820
# Scheme and host of the publisher's site; build() uses it as the base for
# turning the links it is given into full addresses.
URL = "https://www.westermann.de"
# Single-character substitutions applied to a description before it is used
# as part of a file name.  "?", ":" and "/" keep their historical
# replacements; the remaining entries cover the other characters Windows
# rejects in file names so that open() does not fail on them.
_FILENAME_CHAR_MAP = str.maketrans({
    "?": " ",
    ":": ";",
    "/": "-",
    "\\": "-",
    "|": "-",
    "*": " ",
    "<": " ",
    ">": " ",
    '"': "'",
})

# Long German terms and the abbreviation that replaces each of them, applied
# in this order.
_ABBREVIATIONS = (
    ("Baustein", "BS"),
    ("Arbeitsblätter", "AB"),
    ("Arbeitsblatt", "AB"),
    ("Zusatzmaterial", "ZM"),
)


def beautify(description):
    """Return *description* shortened and made safe for use in a file name.

    Characters that cannot appear in file names (on Windows in particular)
    are swapped for harmless look-alikes, then a handful of long terms are
    abbreviated to keep the resulting name short.

    Args:
        description: The text to clean up.

    Returns:
        The cleaned-up text; an empty string stays empty.
    """
    description = description.translate(_FILENAME_CHAR_MAP)
    for long_form, short_form in _ABBREVIATIONS:
        description = description.replace(long_form, short_form)
    return description
def build(url):
    """Return the absolute address for a link found on the publisher's site.

    The link is resolved against the module-level ``URL``.  Root-relative
    paths ("/backend/...") give the same result as plain concatenation;
    in addition, links that are already absolute are returned unchanged and
    paths without a leading slash get the missing separator instead of
    being glued onto the host name.

    Args:
        url: The link target (typically an ``href`` value); it is converted
            with ``str()`` before being resolved.

    Returns:
        The absolute URL as a string.
    """
    # Imported locally so the block does not depend on the file's import list.
    from urllib.parse import urljoin

    return urljoin(URL, str(url))
# Seconds to wait on any single HTTP request before giving up, so one
# unresponsive request cannot stall the whole run.
_REQUEST_TIMEOUT = 30

# Marker text looked for in a link's caption -> extension of the file written
# to disk.  Checked in this order, so "Word" wins if both markers occur.
_FILE_TYPES = (("Word", "doc"), ("PDF", "pdf"))


def _report_error(message, detail=None):
    """Print *message* (and optional *detail*) framed by blank lines."""
    print("")
    print(message)
    if detail is not None:
        print(detail)
    print("")


def download(webcode):
    """Download every Word/PDF file offered for one webcode.

    The webcode is posted to the publisher's webcode form, the page named in
    the redirect is fetched, and each ``div.buchlink`` on it is walked
    paragraph by paragraph: a paragraph without a link supplies the
    description for the files that follow, a paragraph with a link is
    downloaded into the current working directory as
    ``"<description> [<webcode>].<ext>"``.

    Problems that concern a single item (no redirect for the webcode, a
    failed HTTP status, a link without target, an unknown file type) are
    printed and the item is skipped, so that one bad entry does not abort a
    batch run.  Connection-level exceptions raised by ``requests`` are not
    caught here.

    Args:
        webcode: The webcode to look up, e.g. ``"WES-109677-636"``.
    """
    # Submit the webcode without following the redirect: the redirect target
    # is the page that lists the downloads.
    response = requests.post(
        "https://www.westermann.de/webcode",
        data={'webcode': webcode},
        allow_redirects=False,
        timeout=_REQUEST_TIMEOUT,
    )
    location = response.headers.get('Location')
    if not location:
        _report_error(f"ERROR - keine Weiterleitung für Webcode {webcode}.")
        return

    # Fetch the target page and parse it.
    target = build(location)
    page = requests.get(target, timeout=_REQUEST_TIMEOUT)
    if not page.ok:
        _report_error(
            f"ERROR - Zielseite nicht abrufbar (HTTP {page.status_code}).",
            target,
        )
        return
    soup = BeautifulSoup(page.text, "html5lib")

    # The description is carried over from the most recent link-less
    # paragraph to the linked paragraphs that follow it.
    description = None
    for buchlink in soup.find_all("div", attrs={"class": "buchlink"}):
        for p_item in buchlink.find_all("p"):
            anchor = p_item.find("a")
            if anchor is None:
                description = beautify(p_item.get_text(strip=True))
                print(description)
                continue

            href = anchor.get("href")
            if not href:
                _report_error("ERROR - Link ohne Zieladresse.", p_item.text)
                continue
            link_element = build(href)
            print("URL!", link_element)

            # Decide on the file type before downloading, so nothing is
            # fetched for links that would be discarded anyway.
            caption = anchor.text
            extension = next(
                (ext for marker, ext in _FILE_TYPES if marker in caption),
                None,
            )
            if extension is None:
                _report_error("ERROR - unbekannter Dateityp.", p_item.text)
                continue

            file_response = requests.get(
                link_element,
                allow_redirects=True,
                timeout=_REQUEST_TIMEOUT,
            )
            if not file_response.ok:
                # Do not store an error page under a document file name.
                _report_error(
                    "ERROR - Download fehlgeschlagen "
                    f"(HTTP {file_response.status_code}).",
                    link_element,
                )
                continue

            # Without a preceding description fall back to the webcode alone
            # rather than producing a file called "None [...]".
            if description:
                stem = f"{description} [{webcode}]"
            else:
                stem = f"[{webcode}]"
            with open(f"{stem}.{extension}", "wb") as f:
                f.write(file_response.content)
def main():
    """Download the material for every webcode of one title.

    The codes listed here belong to
    "Sprache und Kommunikation im öffentlichen Raum (2022)"; they all share
    one prefix and differ only in the three-digit suffix.
    """
    prefix = "WES-109677-"
    # Suffixes are kept as strings because some of them start with a zero.
    suffixes = (
        "636", "282", "885", "705", "204",
        "513", "401", "962", "166", "311",
        "667", "722", "827", "771", "999",
        "440", "337", "187", "855", "617",
        "222", "079", "483", "323", "599",
        "911", "798", "266", "199", "652",
        "067", "554", "681", "491", "384",
        "142", "952", "777", "755", "110",
        "255",
    )
    # To try a single code while debugging, shrink the tuple, e.g. ("999",).
    webcodes = [prefix + suffix for suffix in suffixes]
    for code in webcodes:
        download(code)
# Start the batch download only when the file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()