|
| 1 | + |
| 2 | +# [ Edited On 16.2.2016 ] |
| 3 | +# On that date this program was working. |
| 4 | + |
| 5 | +#Warning: For original Bucky's typed lines of code, take a look at the file 27_python.py . |
| 6 | + |
#Description:
#This file is an alternative solution for a web crawler.
# The major reason for this is that the website BuckysRoom.com is down, so the original code does not work anymore.
# Solution description (what this program does):
#This program goes to the website https://www.thenewboston.com/search.php?type=0&sort=reputation ,
#visits every user's profile, and on that profile,
#prints the first few (approx. 20) links to the latest photos. To view a photo, click its URL or copy it into a web browser.
| 14 | + |
| 15 | + |
# But history changes, and sooner or later this file or program will stop working.
# On the day of its creation, this program was working.
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | +import requests |
| 24 | +from bs4 import BeautifulSoup |
| 25 | + |
| 26 | + |
def trade_spider(max_pages):
    """Crawl the reputation-sorted user listing on thenewboston.com.

    Visits listing pages 1..max_pages and, for every anchor with class
    'user-name', prints the profile href and the user's display name.

    Args:
        max_pages: highest listing page number to fetch (inclusive).
    """
    page = 1
    while page <= max_pages:
        # BUG FIX: the original URL used 'page==' (double equals), which
        # produced a malformed query parameter (page==1) so pagination
        # never actually advanced on the server side.
        url = ('https://thenewboston.com/search.php'
               '?type=0&sort=reputation&page=' + str(page))
        # allow_redirects=False: just get this page's body, follow nothing
        source_code = requests.get(url, allow_redirects=False)
        # force the markup down to ASCII so odd characters can't break parsing
        plain_text = source_code.text.encode('ascii', 'replace')
        # BeautifulSoup objects can be searched easily
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = link.get('href')
            title = link.string  # just the anchor text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1
| 44 | + |
def get_single_item_data(item_url):
    """Print the photo URLs and all outbound links found on a profile page.

    Args:
        item_url: absolute URL of a user's profile page.
    """
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # CONSISTENCY FIX: use the stdlib 'html.parser' like trade_spider()
    # does; the original asked for 'lxml', an extra third-party
    # dependency that isn't needed here.
    soup = BeautifulSoup(plain_text, 'html.parser')
    # Photos: profile images carry the 'img-responsive' class; their src
    # is site-relative, so prefix the host to get a clickable URL.
    for item_name in soup.findAll('img', {'class': 'img-responsive'}):
        photo = 'https://thenewboston.com' + item_name.get('src')
        print(photo)
    # Links: every anchor on the page (candidate URLs for a web crawler).
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
| 58 | + |
if __name__ == '__main__':
    # Crawl only the first listing page when run as a script; guarding
    # the call keeps the network request from firing on import.
    trade_spider(1)
0 commit comments