
Commit 9a61d57

Merge pull request #804 from Gud-will/main
Automated Website Url scraper
2 parents 21a9d77 + 2e32d14 commit 9a61d57

File tree

3 files changed: +72 −0 lines changed


websiteurl_scraper/README.md

+24
@@ -0,0 +1,24 @@
<h1 align="center">Automated Url Scraper</h1>

<p>This Python script retrieves all the webpage links from a given URL.

A link can also be present inside a button or an action.
</p>

<h2>Libraries needed</h2>

<ul><h3>ssl</h3>
<p>This ships with Python's standard library, so no separate installation is needed.</p>
</ul>

<ul><h3>urllib</h3>
<p>This also ships with Python's standard library.</p></ul>

<ul><h3>BeautifulSoup4</h3>
<p>This can be installed by using pip install beautifulsoup4.</p>
</ul>

<h2>Inputs</h2>
<p>We need to give a valid website link as input.</p>

<h2>Outputs</h2>

<p>The program accesses the website and prints all the links present on the page.</p>
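
For illustration, a run might look like this (the example URL and the printed link are illustrative, not taken from the original commit; actual output depends on the page):

Enter your URL: https://example.com
Here are your final links
https://www.iana.org/domains/example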

websiteurl_scraper/requirements.txt

+12
@@ -0,0 +1,12 @@
We need three libraries in order for this script to work.

The 1st is ssl.
This ships with Python's standard library, so it does not need to be installed separately,
and it helps us to tackle website certificate issues.

The 2nd is urllib.
This also ships with Python's standard library,
and it helps us to access the URL.

The 3rd is bs4, which is BeautifulSoup.
This can be installed by using pip install beautifulsoup4.
It helps us to read the page and access its information.
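
As a side note, a conventional requirements.txt is a plain list of pip-installable packages rather than prose; a minimal sketch for this script would contain a single entry, since ssl and urllib ship with Python:

beautifulsoup4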

websiteurl_scraper/webUrlscraper.py

+36
@@ -0,0 +1,36 @@
# Using ssl and urllib.request to read the contents of the URL
# ssl helps us to skip certificate verification

import ssl
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# build an SSL context that skips certificate verification,
# so pages with broken or self-signed certificates still load
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# getting in the website link
url = input("Enter your URL: ")
try:
    # trying to access the page; a browser-like User-Agent
    # avoids being blocked by sites that reject bare urllib requests
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(request, context=ctx).read()
    # using BeautifulSoup to parse the contents of the page
    soup = BeautifulSoup(page, 'html.parser')
    # find_all('a') returns every anchor tag on the page
    links = soup.find_all('a')
    # collecting the href attribute of each anchor that has one
    finalLinks = [link.get('href') for link in links if link.get('href')]
    print("Here are your final links")
    # printing the final completed list
    for link in finalLinks:
        print(link)
except Exception as e:
    print(str(e))
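
Many pages use relative hrefs (e.g. /about), which the script above would print as-is. A minimal sketch of how one might resolve them against the base URL with urllib.parse.urljoin (an addition for illustration, not part of the original commit; the base and hrefs below are hypothetical):

from urllib.parse import urljoin

base = "https://example.com/docs/"
hrefs = ["/about", "page2.html", "https://other.site/x"]

# urljoin resolves each href against the base URL: absolute URLs
# pass through unchanged, relative ones are resolved against base
absolute = [urljoin(base, h) for h in hrefs]
for link in absolute:
    print(link)
# -> https://example.com/about
#    https://example.com/docs/page2.html
#    https://other.site/x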
