|
| 1 | + |
| 2 | +# [ Edited On 16.2.2016 ] |
| 3 | +# On that date this program was working. |
| 4 | + |
| 5 | +#Warning: For original Bucky's typed lines of code, take a look at the file 27_python.py . |
| 6 | + |
#Description:
#This file is an alternative solution for a web crawler.
# The major reason for this is that the website BuckysRoom.com is down, so the original code does not work anymore.
# Solution description (what this program does):
#This program goes to the website https://www.thenewboston.com/search.php?type=0&sort=reputation ,
#visits every user's profile, and on that profile,
#prints the first few (approx. 20) links to the latest photos. To view a photo, click its URL or copy it into a web browser.
| 14 | + |
| 15 | + |
# But history changes, and sooner or later this file or program will stop working.
# On the day of its creation, this program was working.
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | +import requests |
| 24 | +from bs4 import BeautifulSoup |
| 25 | + |
| 26 | + |
def trade_spider(max_pages):
    """Crawl the reputation-sorted user listing on thenewboston.com.

    Visits listing pages 1..max_pages and, for every anchor with class
    'user-name', prints the profile href and the user's display name.

    Args:
        max_pages: highest listing page number to fetch (inclusive).
    """
    page = 1
    while page <= max_pages:
        # BUG FIX: the original URL used 'page==' (double equals), which
        # produced a malformed query parameter (page==1) so pagination
        # never actually advanced on the server side.
        url = ('https://thenewboston.com/search.php'
               '?type=0&sort=reputation&page=' + str(page))
        # allow_redirects=False: just get this page's body, follow nothing
        source_code = requests.get(url, allow_redirects=False)
        # force the markup down to ASCII so odd characters can't break parsing
        plain_text = source_code.text.encode('ascii', 'replace')
        # BeautifulSoup objects can be searched easily
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = link.get('href')
            title = link.string  # just the anchor text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1
| 44 | + |
def get_single_item_data(item_url):
    """Print the photo URLs and all outbound links found on a profile page.

    Args:
        item_url: absolute URL of a user's profile page.
    """
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # CONSISTENCY FIX: use the stdlib 'html.parser' like trade_spider()
    # does; the original asked for 'lxml', an extra third-party
    # dependency that isn't needed here.
    soup = BeautifulSoup(plain_text, 'html.parser')
    # Photos: profile images carry the 'img-responsive' class; their src
    # is site-relative, so prefix the host to get a clickable URL.
    for item_name in soup.findAll('img', {'class': 'img-responsive'}):
        photo = 'https://thenewboston.com' + item_name.get('src')
        print(photo)
    # Links: every anchor on the page (candidate URLs for a web crawler).
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
| 58 | + |
if __name__ == '__main__':
    # Crawl only the first listing page when run as a script; guarding
    # the call keeps the network request from firing on import.
    trade_spider(1)
0 commit comments