#!/usr/bin/python
#####################################################################################################
#####################################################################################################
## Created on Aug 16, 2013
##
## @author: Gerasimos Kassaras
## E-mail: [email protected]
## Nickname: Lamehacker Free Industries
##
## Conventions used in this coding style:
## Variable Naming Convention: urlSet - A set containing URLs.
## Variable Naming Convention: cleanSet - A set from which certain URLs have been removed based on some conditions.
## Variable Naming Convention: subUrlSet - A subset of a urlSet.
## Variable Naming Convention: urlList - A list containing unfiltered URLs.
## Variable Naming Convention: cleanList - A list from which certain URLs have been removed based on some conditions.
## Variable Naming Convention: subUrlList - A sublist of a urlList.
## TODO Naming Convention: TODO items are marked with the prefix TODO:
## Comments: These naming conventions are valid for every module within webshark.
#######################################################################################################
#######################################################################################################
from urlparse import urlparse
from time import gmtime, strftime
from bs4 import BeautifulSoup
import httplib
import itertools
import re
urlList = ["http://www.example.com/"] # Later on this url is going to be fed through command parser.
crawled = [] # Is going to be used to keep state of visited links.
host = 'http://www.example.com/'
domain = 'www.example.com'
'''
Description: This function fetches the raw HTML for a single page and also manages the connection.
Status: Finished.
TODO: Move this function to a separate connection manager module and also fix the URL parser.
Usage: Fetches the HTML page; used within the fetch function.
'''
def response(url): # Takes as an input a string and returns a string, or None on failure.
    # Takes a single URL and returns the raw HTML of that page.
    urlPath = urlparse(url).path or '/' # Gets the URL path; defaults to the root path.
    rawHtml = None # Returned unchanged if the request fails.
    try:
        print 'Calling response function - Fetching URL path: '+urlPath
        conn = httplib.HTTPConnection(domain)
        conn.request("GET",urlPath)
        rawHtml = conn.getresponse().read() # Collect the HTTP response body.
        conn.close()
    except httplib.HTTPException as msg:
        print 'Inserted URL: '+urlPath
        print msg # TODO: Add better exception handling.
    return rawHtml
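# Illustrative usage (assuming www.example.com resolves and serves HTML):
#   response('http://www.example.com/about') returns the raw HTML of /about,
#   or None if the HTTP request raised an exception.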
'''
Description: This function collects all URLs from a single page and returns a URL list.
Status: In progress.
Usage: Collects, for each raw HTML input, all the URLs found on that page.
'''
def parse(rawHtml): # Takes as an input a string and returns a list.
    if rawHtml is None: # Guard against failed fetches that return None.
        return []
    soup = BeautifulSoup(rawHtml) # Load the raw HTML from the raw HTTP response.
    urlList = []
    for tag in soup.findAll('link', href=True): # Checks for links within the link tag e.g. <link rel="canonical" href="http://example.com/" />
        urlList.append(tag['href'])
    for tag in soup.findAll('a', href=True): # Checks for links within the a tag e.g. <a href="http://example.com/packages/"></a>
        urlList.append(tag['href'])
    for tag in soup.findAll('meta', content=True): # Checks for links within the meta content attribute e.g. <meta property="og:video" content="http://example.com/v/w0?version=3;autohide=1">
        if re.search(r'(^http://)|(^https://)', tag['content'], re.IGNORECASE): # Keeps only values that start with http or https.
            urlList.append(tag['content'])
    for tag in soup.findAll('script', src=True): # Checks for links within the script tag e.g. <script type='text/javascript' src="http://example.com/wp-includes/js/jquery/jquery.js"></script>
        if re.search(r'(^http://)|(^https://)', tag['src'], re.IGNORECASE):
            urlList.append(tag['src'])
    return urlList
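# Illustrative example (hypothetical HTML snippet):
#   parse('<a href="/about"></a><script src="http://www.example.com/j.js"></script>')
#   -> ['/about', 'http://www.example.com/j.js']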
'''
Description: This function is used as a fetcher for the urlList.
Status: In progress.
TODO: Will have to revisit. This is going to become the connection manager in the future.
Usage: Collects, per page, all the URLs.
'''
def fetch(urlList):# Takes as an input a list and returns a list.
    rawUrlList = [] # This is a non-filtered list of the URL lists collected.
    for url in urlList:
        if url not in crawled: # If the URL has already been fetched, do not re-fetch it.
            rawUrlList.append(parse(response(url))) # Fetch URLs from each page.
            crawled.append(url) # Record visited links.
            #print 'Calling fetcher() - Crawled :'+str(crawled)
    return rawUrlList
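# Note: fetch() mutates the module-level 'crawled' list as a side effect, so
# repeated calls across harvester iterations skip URLs that were already visited.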
'''
Description: This function extracts non-HTML files from the URLs collected.
Status: In progress.
Usage: e.g. Extract Windows files from each URL.
'''
def noneHtml(urlList): # Takes as an input a list and returns a list.
    rm = ['.ZIP','.JPG','.DMG','.MSI','.RAR','BZ2','TGZ','.CHM','.TAR','.EXE','.XZ','.DOC','.PDF','.PNG','.TIFF','MAILTO','#','XPI','CRX'] # Add here all the suffixes you would like removed from the final URL list.
    cleanList = []
    for url in urlList:
        for fileSuffix in rm:
            if re.search(re.escape(fileSuffix), url, re.IGNORECASE): # Escape the suffix so '.' matches a literal dot.
                cleanList.append(url)
                break # Do not append the same URL twice.
    return cleanList
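# Illustrative example:
#   noneHtml(['/about', '/files/setup.exe', '/logo.png'])
#   -> ['/files/setup.exe', '/logo.png']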
'''
Description: This function filters the collected URLs so they can be used later on by httplib to collect more URLs.
Status: Finished.
Comment: Might have to revisit.
Usage: Restricts crawling to the start domain.
'''
def restrict(urlList): # Takes as an input a list and returns a list.
    cleanList = []
    for url in urlList:
        if re.search(r"(^/)|(^\.\./)", url, re.IGNORECASE): # Search for inner (relative) URLs.
            cleanList.append(url)
        elif re.search(re.escape(domain), url, re.IGNORECASE): # Restrict absolute URL collection to the same domain.
            cleanList.append(url)
    return cleanList
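# Illustrative example (with domain = 'www.example.com'):
#   restrict(['/about', 'http://www.example.com/help', 'http://other.org/x'])
#   -> ['/about', 'http://www.example.com/help']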
'''
Description: This function is going to be used as a merger for the lists returned from the parse function.
Status: Finished.
Usage: Prevents the URL parser from crashing by converting a list of lists to a flat list of URLs.
'''
def merge(urlList):# Takes as an input a list and returns a list.
    while True:
        if any(isinstance(element, list) for element in urlList): # Checks through all list elements to find lists.
            '''
            Check if this is a list of lists of type [...[]...] or of type [...[],[],[]...].
            Example lists that it checks:
            1. [['/about','/search','/help']]
            2. [[['/about']]]
            3. [['/about',['/search'],'/help']]
            '''
            urlList = list(itertools.chain(*urlList))
            '''
            Removes the external brackets from a list of lists of type [...[]...] or of type [...[],[],[]...].
            Example lists that it alters:
            1. Input: [['/about'],['/search'],['/help']] -> Output: ['/about','/search','/help']
            2. Input: [[[[['/about']]]]] -> Output: ['/about']
            3. Input: [['/about',['/search'],'/hp']] -> bare strings in a mixed list get split into single characters - Breaks the parser - Should never occur.
            '''
        else:
            break # Break the loop if this is not a list of lists.
    return urlList
'''
Description: This function receives a list with the collected URLs and removes duplicate URLs.
Status: Finished.
Usage: Avoids re-downloading the same URLs and prevents the parser from going through loops.
'''
def deduplicate(urlList):# Takes as an input a list and returns a list.
    urlList = list(set(urlList))
    return urlList
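# Illustrative example: deduplicate(['/a','/b','/a']) returns ['/a','/b'] in
# arbitrary order, since set() does not preserve insertion order.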
'''
Description: This function returns the crawled links.
Status: Finished.
Usage: Will be used in the future to get the crawled links.
'''
def getCrawledList():# Takes no input and returns a list.
    return crawled
'''
Description: This function receives a list with the collected URLs and subtracts a sublist of URLs.
Status: Finished.
Usage: Protects the HTML parser from crashing and helps later on to analyze the non-HTML content.
'''
def removeSet(urlList,urlSublist): # Takes as an input two lists and returns a list.
    urlSet = set(urlList)
    urlSubset = set(urlSublist)
    clearSet = urlSet - urlSubset
    return list(clearSet) # Convert back to a list.
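# Illustrative example:
#   removeSet(['/a','/b','/c'], ['/b']) -> ['/a','/c'] (order not guaranteed).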
'''
Description: This function filters out duplicates and non-HTML content.
Status: TODO: In progress.
Usage: Protects the HTML parser from crashing, helps analyze the non-HTML content and removes duplicate URLs.
'''
def purify(urlList): # Takes as an input a list and returns a list.
    cleanList = urlList
    cleanList = merge(cleanList)
    cleanList = restrict(cleanList) # Restrict URLs to the main domain.
    cleanList = deduplicate(cleanList)
    cleanList = removeSet(cleanList,noneHtml(cleanList)) # Clean up non-HTML content.
    return cleanList
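# Illustrative pipeline example (with domain = 'www.example.com'):
#   purify([['/about', '/about', '/logo.png', 'http://other.org/x']])
#   -> ['/about'] (flattened, same-domain only, deduplicated, non-HTML removed).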
'''
Description: This function finds out whether the next page contains the same links as the previous page.
Status: Finished.
Usage: Used within the harvest function as a termination condition.
'''
def isSubset(collectedUrlList,nextPageUrlList):# Takes as an input two lists and returns True or False.
    collectedUrlSet = set(collectedUrlList)
    nextPageUrlSet = set(nextPageUrlList)
    return nextPageUrlSet.issubset(collectedUrlSet)
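# Illustrative example:
#   isSubset(['/a','/b'], ['/a']) -> True   (all of the next page was already collected)
#   isSubset(['/a'], ['/b'])      -> False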
'''
Description: This function finds out whether the next page contains any links.
Status: Finished.
Usage: Used within the harvest function as a termination condition.
'''
def emptyList(page):# Takes as an input a list and returns True or False.
    return page == []
'''
Description: This function exports the URLs into a file.
Status: Finished.
Usage: Used within the harvest function as a URL exporter.
'''
def exportFeed(filename,urlList):# Takes as an input a filename and a list and returns nothing.
    urlList = sorted(urlList) # Sort the URLs so the output is easier to read.
    fobj = open(filename,'w') # Overwrites any previous export.
    for link in urlList:
        fobj.write('['+strftime("%Y-%m-%d %H:%M:%S", gmtime())+'] '+str(link)+'\n') # Export the URLs to the file.
    fobj.close()
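# Illustrative output line in collector.out (the timestamp value is made up):
#   [2013-08-16 12:00:00] /about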
'''
Description: This function fetches the URLs of a single page.
Status: Finished.
Usage: Used within the harvester to fetch the URLs of a single page.
'''
def singlePage(urlList): # Input is a list, output is a list.
    urlList = fetch(urlList)
    urlList = purify(urlList)
    return urlList
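# Illustrative usage: singlePage(['http://www.example.com/']) fetches the seed
# page and returns its cleaned, same-domain URL list.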
'''
Description: This function maintains the state of the crawler.
Status: In Progress.
Usage: Used within the harvester to remove visited links.
'''
def harvester(urlList):# Accepts the initial domain name as a URL list.
    urlCollector = [] # Collects the URLs; should end up equal to the crawled list.
    nextPage = singlePage(urlList)# Gets the start URLs (usually the domain name).
    while True:
        nextPage = singlePage(nextPage)# Fetch the next page.
        if emptyList(nextPage):# Once the crawled list contains all the collected links, the list shrinks to an empty list.
            break
        if not isSubset(urlCollector,nextPage): # Check whether the fetched URLs are already a subset of the collected links.
            for link in nextPage:
                if link not in urlCollector: # Collect each link only once.
                    urlCollector.append(link)
        nextPage = urlCollector
    exportFeed('collector.out',getCrawledList())
    return urlCollector
if __name__ == '__main__':
    harvester(urlList)