Commit 62b4e27: Initial commit

0 parents; 9 files changed, +252 -0 lines

.DS_Store

6 KB, binary file not shown.

.gitignore

+3

zenv/
__pycache__/
logs/

README.md

+3

# Python Phishing URL Detection
---
Python 3.12.3

extractorFunctions.py

+175

# importing required packages for address-bar-based feature extraction
from urllib.parse import urlparse, unquote
import re
# importing required packages for domain-based feature extraction
from datetime import datetime


# 2. Checks for an IP address in the URL (Have_IP)
def havingIP(url):
    ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    match = re.search(ip_pattern, url)
    if match:
        return 1
    return 0

# 3. Checks for the presence of '@' in the URL (Have_At)
def haveAtSign(url):
    return 1 if "@" in url else 0

# 4. Finds the length of the URL (URL_Length)
def getLength(url):
    return len(url)

# 5. Gives the number of non-empty '/'-separated path segments (URL_Depth)
def getDepth(url):
    segments = urlparse(url).path.split('/')
    depth = 0
    for segment in segments:
        if len(segment) != 0:
            depth += 1
    return depth

# listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|bitly\.com|cur\.lv|ity\.im|q\.gs|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"link\.zip\.net"

# 8. Checks for shortening services in the URL (Tiny_URL)
def tinyURL(url):
    match = re.search(shortening_services, url)
    if match:
        return 1
    return 0

# 9. Checks for a prefix or suffix separated by '-' in the domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1  # phishing
    else:
        return 0  # legitimate

# Counts the dots in the whole URL string (No_Of_Dots)
def no_of_dots(url):
    return url.count('.')

sensitiveWords = ["account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
                  "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
                  "login", "support", "billing", "transaction", "security", "payment", "online",
                  "customer", "service", "accountupdate", "verification", "important", "confidential",
                  "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
                  "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
                  "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent"]

# Flags URLs whose domain contains a commonly-abused keyword (Sensitive_Words)
def sensitive_word(url):
    domain = urlparse(url).netloc
    for word in sensitiveWords:
        if word in domain:
            return 1
    return 0


# Detects internationalized (IDNA/punycode) netlocs
def has_unicode(url):
    # Get the netloc part of the URL
    netloc = urlparse(url).netloc

    try:
        # Decode the netloc using IDNA encoding
        decoded_netloc = netloc.encode('latin1').decode('idna')
    except UnicodeError:
        # Netloc cannot be round-tripped through latin-1/IDNA; treating
        # an undecodable netloc as suspicious is an assumption
        return 1

    # Unquote the decoded netloc
    unquoted_netloc = unquote(decoded_netloc)

    # If decoding changed the netloc, the URL carried an encoded form
    if unquoted_netloc != netloc:
        return 1
    return 0

# 13. Survival time of domain: the difference between expiration and creation time (Domain_Age)
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
        except (TypeError, ValueError):
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    elif isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1
    else:
        ageofdomain = abs((expiration_date - creation_date).days)
        # domains registered for less than six months are flagged
        return 1 if (ageofdomain / 30) < 6 else 0

# 14. End time of domain: the difference between expiration time and current time (Domain_End)
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
        except (TypeError, ValueError):
            return 1
    if expiration_date is None:
        return 1
    elif isinstance(expiration_date, list):
        return 1
    else:
        today = datetime.now()
        end = abs((expiration_date - today).days)
        return 0 if (end / 30) < 6 else 1

# 15. IFrame redirection (iFrame)
def iframe(response):
    if response == "":
        return 1
    else:
        # look for iframe / frameBorder markup in the page source
        if re.findall(r"<iframe|<frameBorder", response.text):
            return 0
        else:
            return 1

# 16. Checks the effect of mouseover on the status bar (Mouse_Over)
def mouseOver(response):
    if response == "":
        return 1
    else:
        try:
            if re.findall("<script>.+onmouseover.+</script>", response.text):
                return 1
            else:
                return 0
        except Exception:
            return 1

# 18. Checks the number of forwardings (Web_Forwards)
def forwarding(response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 2:
            return 0
        else:
            return 1
featureExtractor.py

+56

import whois
from urllib.parse import urlparse
import httpx
import pickle as pk
import pandas as pd
import extractorFunctions as ef

# Function to extract the features for a single URL
def featureExtraction(url):

    features = []
    # Address-bar-based features (6)
    features.append(ef.getLength(url))
    features.append(ef.getDepth(url))
    features.append(ef.tinyURL(url))
    features.append(ef.prefixSuffix(url))
    features.append(ef.no_of_dots(url))
    features.append(ef.sensitive_word(url))

    # Domain-based features (2)
    domain_name = ''
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        dns = 1

    # if the WHOIS lookup failed, both domain features default to 1
    features.append(1 if dns == 1 else ef.domainAge(domain_name))
    features.append(1 if dns == 1 else ef.domainEnd(domain_name))

    # HTML & JavaScript based features (3, compressed to one component by PCA)
    dom = []
    try:
        response = httpx.get(url)
    except Exception:
        response = ""

    # append order must match the dom_pd column order below
    # (iFrame, Web_Forwards, Mouse_Over)
    dom.append(ef.iframe(response))
    dom.append(ef.forwarding(response))
    dom.append(ef.mouseOver(response))

    # combined symbol features: unicode + '@' + IP (Have_Symbol)
    features.append(ef.has_unicode(url) + ef.haveAtSign(url) + ef.havingIP(url))

    with open('pca_model.pkl', 'rb') as file:
        pca = pk.load(file)

    # converting the lists to dataframes
    feature_names = ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots', 'Sensitive_Words',
                     'Domain_Age', 'Domain_End', 'Have_Symbol', 'domain_att']
    dom_pd = pd.DataFrame([dom], columns=['iFrame', 'Web_Forwards', 'Mouse_Over'])
    features.append(pca.transform(dom_pd)[0][0])

    row = pd.DataFrame([features], columns=feature_names)

    return row
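
featureExtraction returns a one-row DataFrame, so it can be fed straight to a classifier. main.py is empty in this commit, so the intended use of phishingdetection.pkl is not shown; the sketch below assumes it is a scikit-learn-style classifier with a predict() method and that 1 means phishing (both assumptions). Note also that unpickling pca_model.pkl requires scikit-learn, which requirements.txt does not list.

# Hypothetical end-to-end usage; phishingdetection.pkl is assumed to expose
# a scikit-learn style predict(), and scikit-learn is assumed installed
# (needed to unpickle pca_model.pkl despite being absent from requirements.txt)
import pickle as pk
from featureExtractor import featureExtraction

row = featureExtraction("https://example.com/login")  # one-row feature DataFrame

with open('phishingdetection.pkl', 'rb') as f:
    model = pk.load(f)

# assumed label convention, matching the feature encoding: 1 = phishing
print(model.predict(row)[0])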

main.py

Whitespace-only changes.

pca_model.pkl

914 Bytes, binary file not shown.

phishingdetection.pkl

346 KB, binary file not shown.

requirements.txt

+15

anyio==4.3.0
certifi==2024.2.2
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
numpy==1.26.4
pandas==2.2.2
python-dateutil==2.9.0.post0
pytz==2024.1
regex==2024.4.16
six==1.16.0
sniffio==1.3.1
tzdata==2024.1
whois==1.20240129.2
