Skip to content

Commit 067c5fc

Browse files
authored
feature/normalize fqdns (csirtgadgets#148)
* Add test-cases for normalizing urls/fqdns * Add matching for dots at end of fqdn And create `normalize_indicator` func * Implement `normalize_indicator` func for urls/fqdns * Fix imports * Init github action for commit tests * Relocate .github dir to root * Add tests on PRs * Don't lowercase `reference` field
1 parent 2d05672 commit 067c5fc

File tree

6 files changed

+110
-38
lines changed

6 files changed

+110
-38
lines changed

.github/workflows/push_test.yml

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Run the test suite and build the distributions on every push and pull request.
name: Test latest commit

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Set up Python 3.6
        uses: actions/setup-python@v1
        with:
          # Quoted so YAML keeps the version a string; a bare 3.10 would be
          # parsed as the float 3.1.
          python-version: '3.6'

      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          sudo apt-get update
          python -m pip install --upgrade pip
          pip install -r dev_requirements.txt
          pip install stix

      # Kept as separate steps so a failure is attributed to the tests or the
      # build rather than to dependency installation.
      - name: Run tests
        run: python setup.py test

      - name: Build distributions
        run: python setup.py sdist bdist bdist_wheel

csirtg_indicator/indicator.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
from base64 import b64encode
77
from .exceptions import InvalidIndicator
88
from . import VERSION
9-
from .utils import parse_timestamp, resolve_itype, is_subdomain, ipv4_normalize
9+
from .utils import resolve_itype, is_subdomain, ipv4_normalize, normalize_indicator
10+
from .utils.ztime import parse_timestamp
1011
import pytricia
1112
import codecs
1213
from datetime import datetime
@@ -33,7 +34,7 @@ def __init__(self, indicator=None, version=PROTOCOL_VERSION, **kwargs):
3334
self.version = version
3435
if 'lowercase' in kwargs:
3536
self._lowercase = kwargs.get('lowercase')
36-
# indicate lowercase arg was explicitly passed by user rather than just a default value
37+
# indicate lowercase arg was explicitly passed by user rather than just a default value
3738
self._lowercase_explicit = True
3839
else:
3940
# set lowercase to True by default, but ensure we can later determine it was not user specified
@@ -55,10 +56,10 @@ def __init__(self, indicator=None, version=PROTOCOL_VERSION, **kwargs):
5556
continue
5657

5758
if isinstance(kwargs[k], basestring):
58-
# always stripe whitespace
59+
# always strip whitespace
5960
kwargs[k] = kwargs[k].strip()
6061

61-
if self._lowercase is True:
62+
if self._lowercase is True and k != 'reference': # don't lower reference which may be a url
6263
kwargs[k] = kwargs[k].lower()
6364
if k in ['tags', 'peers']:
6465
kwargs[k] = kwargs[k].split(',')
@@ -115,20 +116,16 @@ def indicator(self, i):
115116
self.itype = resolve_itype(i.lower())
116117
self._indicator = i
117118

118-
if self.itype == 'url':
119-
u = urlparse(self._indicator)
120-
if self._lowercase is True and self._lowercase_explicit is True:
121-
self._indicator = u.geturl().rstrip('/').lower()
122-
else:
123-
self._indicator = u.geturl().rstrip('/')
124-
else:
125-
if self._lowercase is True:
126-
self._indicator = self._indicator.lower()
127-
119+
if self.itype in ['url', 'fqdn']:
120+
self._indicator = normalize_indicator(self._indicator, itype=self.itype,
121+
lowercase=self._lowercase, lowercase_explicit=self._lowercase_explicit)
128122

129-
if self.itype == 'ipv4':
123+
elif self.itype == 'ipv4':
130124
self._indicator = ipv4_normalize(self._indicator)
131125

126+
else:
127+
self._indicator = self._indicator.lower()
128+
132129
if self.mask and (self.itype in ['ipv4', 'ipv6']):
133130
self._indicator = '{}/{}'.format(self._indicator, int(self.mask))
134131
self.mask = None
@@ -297,7 +294,7 @@ def __repr__(self):
297294
v = v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
298295

299296
if isinstance(v, basestring):
300-
if k not in ['indicator', 'message'] and not k.endswith('time') and self._lowercase is True:
297+
if k not in ['indicator', 'message', 'reference'] and not k.endswith('time') and self._lowercase is True:
301298
v = v.lower()
302299

303300
if k == 'confidence':

csirtg_indicator/utils/__init__.py

+35-16
Original file line numberDiff line numberDiff line change
@@ -3,41 +3,41 @@
33
import ipaddress
44
from ..exceptions import InvalidIndicator
55
from ..constants import PYVERSION
6-
from .ztime import parse_timestamp
7-
import sys
6+
87

98
if PYVERSION == 3:
109
from urllib.parse import urlparse
10+
from urllib.parse import urlsplit
11+
from urllib.parse import urlunsplit
1112
else:
1213
from urlparse import urlparse
1314

14-
from pprint import pprint
1515

16-
RE_IPV4 = re.compile('^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(\d{1,3})$')
17-
RE_IPV4_CIDR = re.compile('^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\/\d{1,2})$')
16+
RE_IPV4 = re.compile(r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(\d{1,3})$')
17+
RE_IPV4_CIDR = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\/\d{1,2})$')
1818

1919
# http://stackoverflow.com/a/17871737
20-
RE_IPV6 = re.compile('(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))')
20+
RE_IPV6 = re.compile(r'(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))')
2121

2222
# http://goo.gl/Cztyn2 -- probably needs more work
2323
# http://stackoverflow.com/a/26987741/7205341
2424
# ^((xn--)?(--)?[a-zA-Z0-9-_@]+(-[a-zA-Z0-9]+)*\.)+[a-zA-Z]{2,}(--p1ai)?$
2525
#RE_FQDN = re.compile('^((?!-))(xn--)?[a-z0-9][a-z0-9-_\.]{0,61}[a-z0-9]{0,1}\.(xn--)?([a-z0-9\-]{1,61}|[a-z0-9-]{1,30}\.[a-z]{2,})$')
2626
# http://stackoverflow.com/questions/14402407/maximum-length-of-a-domain-name-without-the-http-www-com-parts
27-
RE_FQDN = re.compile('^((?!-))(xn--)?[a-z0-9][a-z0-9-_\.]{0,245}[a-z0-9]{0,1}\.(xn--)?([a-z0-9\-]{1,61}|[a-z0-9-]{1,30}\.[a-z]{2,})$')
28-
RE_URI_SCHEMES = re.compile('^(https?|ftp)$')
29-
RE_EMAIL = re.compile("^[-\w+.!#$%&'*\/=?^_`{|}~;]+@[-.0-9a-zA-Z][-.0-9a-zA-Z]*[a-zA-Z]{2,}$")
30-
RE_ASN = re.compile('^(AS|as)[0-9]{1,6}$')
27+
RE_FQDN = re.compile(r'^((?!-))(xn--)?[a-z0-9][a-z0-9-_\.]{0,245}[a-z0-9]{0,1}\.(xn--)?([a-z0-9\-]{1,61}|[a-z0-9-]{1,30}\.[a-z]{2,})\.?$')
28+
RE_URI_SCHEMES = re.compile(r'^(https?|ftp)$')
29+
RE_EMAIL = re.compile(r"^[-\w+.!#$%&'*\/=?^_`{|}~;]+@[-.0-9a-zA-Z][-.0-9a-zA-Z]*[a-zA-Z]{2,}$")
30+
RE_ASN = re.compile(r'^(AS|as)[0-9]{1,6}$')
3131

3232
RE_HASH = {
33-
'uuid': re.compile('^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'),
34-
'md5': re.compile('^[a-fA-F0-9]{32}$'),
35-
'sha1': re.compile('^[a-fA-F0-9]{40}$'),
36-
'sha256': re.compile('^[a-fA-F0-9]{64}$'),
37-
'sha512': re.compile('^[a-fA-F0-9]{128}$'),
33+
'uuid': re.compile(r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'),
34+
'md5': re.compile(r'^[a-fA-F0-9]{32}$'),
35+
'sha1': re.compile(r'^[a-fA-F0-9]{40}$'),
36+
'sha256': re.compile(r'^[a-fA-F0-9]{64}$'),
37+
'sha512': re.compile(r'^[a-fA-F0-9]{128}$'),
3838
}
3939

40-
RE_IPV4_PADDING = re.compile(r"(^|\.)0+([^/.])")
40+
RE_IPV4_PADDING = re.compile(r'(^|\.)0+([^/.])')
4141

4242

4343
def ipv4_normalize(i):
@@ -205,6 +205,25 @@ def normalize_itype(i, itype=None):
205205
return i
206206

207207

208+
def _normalize_netloc(netloc):
    """Lowercase the host of a URL netloc and drop its trailing root dot(s).

    Userinfo (``user:pass@``) and the port are passed through untouched:
    userinfo may be case-sensitive, and a port must not hide the host's
    trailing dot from being stripped.
    """
    userinfo, at, hostport = netloc.rpartition('@')
    if hostport.startswith('['):
        # bracketed IPv6 literal: colons inside the brackets are not a port separator
        host, bracket, port = hostport.partition(']')
        host += bracket
    else:
        host, colon, port = hostport.partition(':')
        port = colon + port
    return userinfo + at + host.rstrip('.').lower() + port


def normalize_indicator(i, itype=None, lowercase=False, lowercase_explicit=False):
    """Return the normalized form of a url or fqdn indicator.

    :param i: the raw indicator string
    :param itype: indicator type; only 'fqdn' and 'url' are normalized,
        any other value returns ``i`` unchanged
    :param lowercase: the lowercase setting in effect
    :param lowercase_explicit: True when ``lowercase`` was set by the user
        rather than left at its default
    :return: the normalized indicator string

    Case handling:
      * explicit lowercase=False -- case is left as given
      * explicit lowercase=True  -- the whole indicator is lowercased
      * no explicit setting      -- an fqdn is lowercased; for a url only the
        host is lowercased, because path, query and userinfo may be
        case-sensitive

    Trailing dots are always stripped from an fqdn and trailing slashes from
    a url; a url host's trailing dot is stripped in the no-explicit-setting
    case.
    """
    keep_case = lowercase_explicit and not lowercase

    if itype == 'fqdn':
        i = i.rstrip('.')
        return i if keep_case else i.lower()

    if itype == 'url':
        i = urlparse(i).geturl().rstrip('/')
        if keep_case:
            return i
        if lowercase_explicit:
            return i.lower()
        # urlparse is imported on both the py2 and py3 paths of this module,
        # unlike urlsplit/urlunsplit, so this branch works on either.
        u = urlparse(i)
        return u._replace(netloc=_normalize_netloc(u.netloc)).geturl()

    return i
225+
226+
208227
def is_subdomain(i):
209228
itype = resolve_itype(i)
210229
if itype is not 'fqdn':

test/test_fqdn.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from csirtg_indicator import Indicator
2-
from csirtg_indicator.utils import is_subdomain
32
from faker import Faker
43
fake = Faker()
54

@@ -12,6 +11,8 @@
1211
'xn----jtbbmekqknepg3a.xn--p1ai',
1312
'dualstack.cddf-prod-frontend-1ho73vqwbi0tw-1326553765.us-east-1.elb.amazonaws.com',
1413
'laser-retargeting-server-production.us-east-1-prod-core-edge-public.spongecell.net',
14+
'example.org.',
15+
'an0ther.exAmple.orG.',
1516
]
1617

1718

@@ -39,14 +40,17 @@ def test_fqdn_urls():
3940
def test_fqdn_ok():
    """Every GOOD entry resolves to an fqdn, lowercased and without a trailing dot."""
    for d in GOOD:
        e = Indicator(d)
        # compare by value: `is` against a str literal only passes when the
        # interpreter happens to intern both strings
        assert e.itype == 'fqdn'
        assert e.indicator == d.rstrip('.').lower()
4447

4548

4649
def test_fqdn_subdomain():
4750
data = [
4851
'www.yahoo.com',
4952
'www.ww2.yahoo.com',
53+
'this.is.aNother.sub.domain.tld.',
5054
]
5155

5256
for d in data:
@@ -57,6 +61,7 @@ def test_fqdn_subdomain():
5761
data = [
5862
'yahoo.com',
5963
'google.com',
64+
'notasubdomain.tLd.',
6065
'http://google.com',
6166
'https://www.google.com',
6267
'http://www2.www.google.com',

test/test_indicator.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from weakref import ref
12
from csirtg_indicator import Indicator
23
import json
34
from csirtg_indicator.exceptions import InvalidIndicator
@@ -30,9 +31,10 @@ def test_indicator_url():
3031
assert 'malware' in i.tags
3132

3233

33-
def test_indicator_mixedcase_lower_false():
34-
i = Indicator('http://example.org/MiXeDCaSe',
35-
tags='botnet,malware', lowercase=False)
34+
# by default, if it's a url, lower the hostname but not the path
35+
def test_indicator_mixedcase_lower_default():
36+
i = Indicator('http://exAmple.Org./MiXeDCaSe',
37+
tags='botnet,malware')
3638

3739
assert i.is_private() is False
3840
assert i.indicator == 'http://example.org/MiXeDCaSe'
@@ -41,7 +43,19 @@ def test_indicator_mixedcase_lower_false():
4143
assert 'botnet' in i.tags
4244
assert 'malware' in i.tags
4345

46+
# if it's a url and user explicitly says don't lowercase, leave entire indicator as-is
47+
def test_indicator_mixedcase_lower_false():
    """An explicit lowercase=False leaves the case of a url indicator untouched."""
    i = Indicator('http://examPle.org/MiXeDCaSe',
                  tags='botnet,malware', lowercase=False)

    assert i.is_private() is False
    assert i.indicator == 'http://examPle.org/MiXeDCaSe'
    # compare by value: `is` / `is not` against a str literal depends on
    # string interning and warns on Python 3.8+
    assert i.itype != 'fqdn'
    assert i.itype == 'url'
    assert 'botnet' in i.tags
    assert 'malware' in i.tags
4457

58+
# if it's a url and user explicitly says to lowercase, force all parts to lowercase
4559
def test_indicator_mixedcase_lower_true():
4660
i = Indicator('http://example.org/MiXeDCaSe',
4761
tags='botnet,malware', lowercase=True)
@@ -136,3 +150,12 @@ def test_eq():
136150

137151
u2.uuid = u1.uuid
138152
assert u1 == u2
153+
154+
def test_reference_field_case():
    """The reference field keeps its case on the object and in its JSON repr."""
    expected = 'http://good.intel.tld/API/Generator'
    indicator = Indicator('http://example.org', tags='botnet,malware', reference=expected)
    assert indicator.reference == expected

    # the repr is a JSON document; the reference must survive serialization as-is
    decoded = json.loads(indicator.__repr__())
    assert decoded['reference'] == expected

test/test_urls.py

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
'http://[email protected]/aol5/a000l.html',
1616
'https://example.com:443/1.html',
1717
'http://test1.test2.example.com',
18+
'http://tEst1.test3.exAmple.com.',
1819
'http://xz.job391.com/down/����࿪��@89_1_60',
1920
'http://refreshdharan.com/bg/excel2/index.php?userid={[email protected]}',
2021
'http://https.www.paypal.blahblahblahblah.web.cgi.bin.blahblah.blahblahblahblah.blahblahblah-blah-blah-blah.com/signin/',

0 commit comments

Comments
 (0)