import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# initialize colorama for cross-platform colored console output
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# global sets of all discovered internal and external links
internal_urls = set()
external_urls = set()

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

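# Quick illustration (not part of the script's flow): is_valid only checks
# that both a scheme and a network location are present, e.g.
#   is_valid("https://example.com/page")  -> True
#   is_valid("/just/a/path")              -> False
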
def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all internal URLs found on `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # skip <a> tags with an empty or missing href
            continue

        # resolve relative links against the current page URL
        href = urljoin(url, href)

        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
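        # e.g. "https://example.com/a?page=2#top" -> "https://example.com/a",
        # so the same page is not re-queued under different queries/fragments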

        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        if domain_name not in href:
            # external link (note: a plain substring check, so subdomains
            # of the target domain are also treated as internal)
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
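
# Illustrative one-off call (URL is hypothetical):
#   links = get_all_website_links("https://example.com")
# prints every link as it is found and returns only the internal ones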

# number of URLs visited so far
total_urls_visited = 0

def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
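
# Note: total_urls_visited is incremented once per crawl() call, so
# max_urls bounds the number of pages fetched, not the links discovered.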

if __name__ == "__main__":
    website = input("Please enter the website you want to scrape: ")
    crawl(website)
    print("[+] Total External links:", len(external_urls))
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total:", len(external_urls) + len(internal_urls))
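
# Example session (script name and output are hypothetical; actual links
# depend on the site being crawled):
#   $ python link_extractor.py
#   Please enter the website you want to scrape: https://example.com
#   [*] Internal link: https://example.com/about
#   [!] External link: https://www.iana.org/domains/example
#   ...
#   [+] Total External links: 12
#   [+] Total Internal links: 41
#   [+] Total: 53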