Commit 450ba6e

Merge pull request #89 from Tejas1510/main
Changed the readme as required on issue #61
2 parents 4e6d766 + 05dd3a3 commit 450ba6e

File tree

5 files changed: +92 −0 lines changed


Python/WebCrawler/Readme.MD

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
## This is a web crawler for grabbing links from a website

# The code is built in Python

# A sample output looks like this while scraping a webpage of a website entered by the user

![endpoint](https://github.com/Tejas1510/hacking-tools-scripts/blob/main/Python/WebCrawler/images/Screenshot%20(271).png)

# To scrape a particular website, just run the code and enter the name of the website you want to scrape
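For orientation, here is a minimal sketch of the single-page step described above: fetch one page, collect every anchor it links to, and resolve relative links against the page URL. The target URL is only an example (not taken from the repository or its screenshots), and the full script in webcrawler.py below additionally filters invalid links, separates internal from external ones, and recurses into internal links.

```python
# Minimal single-page link grab (illustrative URL; not part of the repository).
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

page_url = "https://example.com"  # example target entered by the user
soup = BeautifulSoup(requests.get(page_url).content, "html.parser")

# Resolve each href against the page URL so relative links become absolute.
links = {urljoin(page_url, a["href"]) for a in soup.find_all("a", href=True)}
print(f"Found {len(links)} links on {page_url}")
```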
241 KB

Python/WebCrawler/images/input.png

284 KB

Python/WebCrawler/images/output.png

230 KB

Python/WebCrawler/webcrawler.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

internal_urls = set()
external_urls = set()

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue

        # resolve relative links against the page URL
        href = urljoin(url, href)

        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls

# number of URLs visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

if __name__ == "__main__":
    website = input("Please enter the website you want to scrape: ")
    crawl(website)
    print("[+] Total External links:", len(external_urls))
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total:", len(external_urls) + len(internal_urls))
