|
| 1 | +# importing required packages for Address Bar Based feature Extraction |
| 2 | +from urllib.parse import urlparse,urlencode, unquote |
| 3 | +import re |
| 4 | +# importing required packages for Domain Based Feature Extraction |
| 5 | +import whois |
| 6 | +from datetime import datetime |
| 7 | + |
| 8 | + |
# 2. Flags URLs that embed an IPv4-looking token (Have_IP)
def havingIP(url):
    """Return 1 if *url* contains a dotted-quad token, otherwise 0.

    Only the shape is checked (four groups of 1-3 digits separated by dots);
    the numeric range of each group is not validated.
    """
    dotted_quad = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
    return 0 if dotted_quad.search(url) is None else 1
| 16 | + |
# 3. Flags URLs that contain an '@' character (Have_At)
def haveAtSign(url):
    """Return 1 if '@' occurs anywhere in *url*, otherwise 0."""
    return int("@" in url)
| 24 | + |
# 4. Length of the URL in characters (URL_Length)
def getLength(url):
    """Return the number of characters in *url* (no bucketing is applied)."""
    url_length = len(url)
    return url_length
| 28 | + |
# 5. Number of non-empty path segments in the URL (URL_Depth)
def getDepth(url):
    """Return how many non-empty '/'-separated segments the URL path has.

    Only the path component is considered; empty segments produced by
    leading, trailing or doubled slashes are not counted.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
| 37 | + |
# Regex alternation of known URL-shortening service host names.
# NOTE: a few entries are repeated; the repetition is harmless in an
# alternation and the value is kept unchanged for any other consumer.
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

# The alternation anchored to whole host labels: the host must BE one of the
# services or be a subdomain of one (an optional trailing root dot is allowed).
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:" + shortening_services + r")\.?$",
    re.IGNORECASE,
)

# Detects an explicit scheme ("http://") or a scheme-relative prefix ("//").
_HAS_AUTHORITY_RE = re.compile(r"(?:[a-zA-Z][a-zA-Z0-9+.\-]*:)?//")


# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 if the host of *url* is a known URL shortener, otherwise 0.

    The pattern is matched against the parsed host name only and is anchored
    on label boundaries. Searching the raw URL would flag unrelated hosts that
    merely contain a shortener as a substring (for example ``t.co`` inside
    ``microsoft.com``). Matching is case-insensitive because host names are.

    URLs without a scheme (``bit.ly/abc``) are accepted. A URL whose host
    cannot be parsed yields 0.
    """
    # urlparse only fills in the host when the string has a "//" authority
    # marker, so add one for bare "host/path" inputs.
    candidate = url if _HAS_AUTHORITY_RE.match(url) else "//" + url
    try:
        host = urlparse(candidate).hostname
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        return 0
    if not host:
        return 0
    return 1 if _SHORTENER_HOST_RE.search(host) else 0
| 55 | + |
# 9. Flags a '-' inside the network-location part of the URL (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) if the URL's netloc contains '-', else 0 (legitimate)."""
    network_location = urlparse(url).netloc
    return 1 if network_location.count('-') else 0
| 62 | + |
| 63 | +def no_of_dots(url): |
| 64 | + return url.count('.') |
| 65 | + |
# Keywords whose presence in a host name is treated as a phishing signal.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate adjacent string literals into a single bogus keyword.
# NOTE(review): "ebyisapi" looks like a typo for "ebayisapi" - confirm before changing.
sensitiveWords = [
    "account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
    "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
    "login", "support", "billing", "transaction", "security", "payment", "online",
    "customer", "service", "accountupdate", "verification", "important", "confidential",
    "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
    "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
    "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent",
]


def sensitive_word(url):
    """Return 1 if the URL's netloc contains any sensitive keyword, else 0.

    Only the network-location part is inspected (path and query are ignored).
    The netloc is lower-cased first because host names are case-insensitive
    while the keyword list is all lower-case.
    """
    domain = urlparse(url).netloc.lower()
    return 1 if any(word in domain for word in sensitiveWords) else 0
| 80 | + |
| 81 | + |
def has_unicode(url):
    """Return 1 if the URL's netloc hides non-plain-ASCII content, else 0.

    The netloc is flagged (1) when any of the following holds:
      * it contains raw non-ASCII characters,
      * it contains punycode ("xn--") labels that decode to something else,
      * it cannot be IDNA-decoded at all (malformed punycode),
      * it contains percent-escapes that change when unquoted.
    """
    netloc = urlparse(url).netloc

    try:
        # encode() fails on raw non-ASCII hosts and decode() fails on invalid
        # punycode; both are the kind of host this feature exists to flag,
        # so report them instead of raising.
        decoded_netloc = netloc.encode('ascii').decode('idna')
    except UnicodeError:
        return 1

    # Percent-encoded characters in the host also count as obfuscation.
    unquoted_netloc = unquote(decoded_netloc)

    return 1 if unquoted_netloc != netloc else 0
| 100 | + |
# 13. Survival time of domain: the difference between expiration time and creation time (Domain_Age)
def domainAge(domain_name):
    """Return 1 if the domain's registered lifetime is under ~6 months, else 0.

    *domain_name* is an object exposing ``creation_date`` and
    ``expiration_date`` attributes (presumably a WHOIS lookup result - confirm
    against the caller). Each attribute may be a ``datetime``, a
    ``'%Y-%m-%d'`` string, ``None`` or a list.

    Returns 1 whenever the lifetime cannot be determined: a missing date, a
    list of dates, an unparsable string, or dates that cannot be subtracted.
    Otherwise returns 1 when ``abs(expiration - creation)`` in days, divided
    by 30, is below 6, and 0 when it is not.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    # Parse each value on its own: a string paired with a real datetime is
    # valid input and must not be treated as a parse failure.
    try:
        if isinstance(creation_date, str):
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
        if isinstance(expiration_date, str):
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except ValueError:
        return 1

    if expiration_date is None or creation_date is None:
        return 1
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1

    try:
        ageofdomain = abs((expiration_date - creation_date).days)
    except TypeError:
        # e.g. one value is timezone-aware and the other is naive.
        return 1

    return 1 if (ageofdomain / 30) < 6 else 0
| 122 | + |
# 14. End time of domain: the difference between expiration time and current time (Domain_End)
def domainEnd(domain_name):
    """Return 0 if the domain expires within ~6 months of now, else 1.

    *domain_name* is an object exposing an ``expiration_date`` attribute
    (presumably a WHOIS lookup result - confirm against the caller). The
    attribute may be a ``datetime``, a ``'%Y-%m-%d'`` string, ``None`` or a
    list.

    Returns 1 when the date is missing, is a list, or is an unparsable
    string. Otherwise the absolute distance in days between the expiration
    date and the current time is divided by 30: below 6 gives 0, otherwise 1.
    Because the distance is absolute, a date in the past is treated the same
    as one equally far in the future.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    # Take "now" in the same timezone-awareness as the expiration date;
    # subtracting a naive datetime from an aware one raises TypeError.
    today = datetime.now(getattr(expiration_date, "tzinfo", None))
    end = abs((expiration_date - today).days)
    return 0 if (end / 30) < 6 else 1
| 143 | + |
# 15. IFrame Redirection (iFrame)
def iframe(response):
    """Return 0 if the page markup uses an iframe / frameBorder, else 1.

    *response* is either the empty string (used as a "no response" sentinel,
    which yields 1) or an object with a ``text`` attribute holding the page
    source.

    The pattern is a real alternation. A bracketed form such as
    ``[<iframe>|<frameBorder>]`` is a character class that matches any single
    one of those letters, so it would succeed on almost every page.

    NOTE(review): the 0/1 polarity (iframe present -> 0) is kept exactly as
    the callers currently receive it; confirm it matches the feature
    definition used by the model.
    """
    if response == "":
        return 1
    if re.search(r"<\s*iframe\b|\bframeborder\b", response.text, re.IGNORECASE):
        return 0
    return 1
| 153 | + |
# 16. Checks for onmouseover handling inside a script block (Mouse_Over)
def mouseOver(response):
    """Return 1 if a <script> element mentions ``onmouseover``, else 0.

    *response* is either the empty string (a "no response" sentinel, which
    yields 1) or an object with a ``text`` attribute holding the page source.
    If the text cannot be read or searched, 1 is returned as well.

    The search is case-insensitive, allows attributes on the opening tag and
    spans line breaks, so multi-line scripts are covered. It never crosses a
    closing ``</script>``, so an ``onmouseover`` outside any script element
    does not count.
    """
    if response == "":
        return 1
    pattern = (
        r"<script\b[^>]*>"
        r"(?:(?!</script>).)*?onmouseover(?:(?!</script>).)*?"
        r"</script>"
    )
    try:
        found = re.search(pattern, response.text, re.IGNORECASE | re.DOTALL)
    except (AttributeError, TypeError):
        # No usable page text (missing attribute or non-string content).
        return 1
    return 1 if found else 0
| 166 | + |
# 18. Flags responses that went through more than two redirects (Web_Forwards)
def forwarding(response):
    """Return 1 if there is no response or its history has more than 2 entries, else 0.

    *response* is either the empty string (a "no response" sentinel) or an
    object whose ``history`` attribute supports ``len()``.
    """
    if response == "":
        return 1
    hop_count = len(response.history)
    return 1 if hop_count > 2 else 0
0 commit comments