Skip to content

Commit

Permalink
Updated CopyRight Notice
Browse files Browse the repository at this point in the history
Updated the copyright notice from an Email Scraper Framework to an Information Gathering Tool inside bh.py
  • Loading branch information
theHamdiz committed Aug 25, 2023
1 parent c256a6d commit d31173c
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions bh.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Blue Hawk By Ahmad Hamdi Emara - Email Scraper Framework
Blue Hawk By Ahmad Hamdi Emara - Information Gathering Tool
Version: 1.0.0
Author: Ahmad Hamdi Emara
Website: https://hamdiz.me
Expand Down Expand Up @@ -47,7 +47,8 @@ def scrape(self):
break
url = self.urls.popleft()
self.scraped_urls.add(url)
print(colorize(f'🔥[{self.counter}] Processing {self._truncate(url, 50)}', 'yellow', True))
print(colorize(
f'🔥[{self.counter}] Processing {self._truncate(url, 50)}', 'yellow', True))
response = None
if (self.mode in [ScrapeMode.SMART, ScrapeMode.LAZY] and self._get_domain(
self.target_url) == self._get_domain(url)) or self.mode == ScrapeMode.VERBOSE:
Expand All @@ -63,7 +64,7 @@ def scrape(self):

def _check_exit_conditions(self):
if ((self.mode == ScrapeMode.LAZY and self.counter > 1) or ((
self.mode == ScrapeMode.SMART or self.mode == ScrapeMode.VERBOSE) and self.counter > self.max_depth + 1)):
self.mode == ScrapeMode.SMART or self.mode == ScrapeMode.VERBOSE) and self.counter > self.max_depth + 1)):
return True

return False
Expand All @@ -85,7 +86,8 @@ def _get_response(url):
def _process_response(self, response, url):
new_emails = set(re.findall(
self.regex_config.pattern, response.text, re.I))
new_phones = self._clean_phone_numbers(set(re.findall(self.regex_config.phone_regex, response.text, re.I)))
new_phones = self._clean_phone_numbers(
set(re.findall(self.regex_config.phone_regex, response.text, re.I)))

self.phone_numbers.update(new_phones)
self.emails.update(new_emails)
Expand All @@ -99,7 +101,8 @@ def _process_response(self, response, url):
# Here we are checking for found emails.
if not new_emails:
# If they're not directly visible in the HTML, we fall back to checking mailto links.
new_emails = set(re.findall(self.regex_config.mailto_regex, response.text, re.I))
new_emails = set(re.findall(
self.regex_config.mailto_regex, response.text, re.I))
self.emails.update(new_emails)

def _process_anchor(self, anchor, base_url, path):
Expand All @@ -112,7 +115,8 @@ def _process_anchor(self, anchor, base_url, path):

if link not in self.urls and link not in self.scraped_urls:
self.urls.append(link)
user_names = self._filter_and_construct_links(set(re.findall(self.regex_config.username_regex, link, re.I)))
user_names = self._filter_and_construct_links(
set(re.findall(self.regex_config.username_regex, link, re.I)))
self.user_names.update(user_names)

@staticmethod
Expand All @@ -138,10 +142,12 @@ def _display_emails(self) -> None:
@staticmethod
def _filter_and_construct_links(results) -> set:
# Regular routes that are not usernames
non_user_routes = {'in', 'p', 'sharer', 'intent', 'channel', 'shareArticle', 'reel', 'share', 'add', 'c'}
non_user_routes = {'in', 'p', 'sharer', 'intent',
'channel', 'shareArticle', 'reel', 'share', 'add', 'c'}

# Filter out results with non-user routes
filtered_results = {(platform, route) for platform, route in results if route not in non_user_routes}
filtered_results = {(platform, route) for platform,
route in results if route not in non_user_routes}

# Construct platform links without 'https://www.'
links = {f"{platform}/{route}" for platform, route in filtered_results}
Expand Down Expand Up @@ -181,7 +187,8 @@ def _filter_results(self) -> None:
Final Check on email results to remove false positives.
"""
extensions = (".png", ".webp", ".jpg", ".jpeg", ".tiff", ".gif")
emails_to_remove = {email for email in self.emails if str(email).endswith(extensions)}
emails_to_remove = {email for email in self.emails if str(
email).endswith(extensions)}

self.emails -= emails_to_remove

Expand Down

0 comments on commit d31173c

Please sign in to comment.