-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
110 lines (101 loc) · 4.38 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
from repositories_reader import read_repositories_file
from repositories_requester import request_url
from utils import is_valid_repository, get_folder_or_file_name, get_lines_and_bytes, generate_str_with_spaces, get_file_extension, print_to_file
import re
from bs4 import BeautifulSoup
from multiprocessing import Pool
def pull_folder_content(url):
    """
    Fetch a directory page and return its listing table body.

    The page at ``url`` is downloaded with ``request_url`` and parsed with
    BeautifulSoup's ``html.parser``.

    Returns the first ``<tbody>`` element of the parsed page (per the
    original author's note, the table holding the directory items), or
    ``None`` when the page has no ``<tbody>``.
    """
    page_html = request_url(url).text
    parsed_page = BeautifulSoup(page_html, 'html.parser')
    return parsed_page.tbody
# Tag name and CSS class string used to locate the file-information element
# on a file page.
FILE_ELEMENT_FINDER = 'div'
FILE_CLASS_FINDER = 'text-mono f6 flex-auto pr-3 flex-order-2 flex-md-order-1 mt-2 mt-md-0'


def pull_file_content(url):
    """
    Fetch a file page and return the text of its information element.

    The page at ``url`` is downloaded with ``request_url`` and parsed with
    BeautifulSoup's ``html.parser``; the first element matching
    ``FILE_ELEMENT_FINDER`` / ``FILE_CLASS_FINDER`` is then looked up.

    Returns a list with the non-blank lines of that element's text, each
    stripped of surrounding whitespace (per the original author's note,
    these carry the file's line and byte information). Returns an empty
    list when no matching element is found.
    """
    page = BeautifulSoup(request_url(url).text, 'html.parser')
    info_element = page.find(FILE_ELEMENT_FINDER, class_=FILE_CLASS_FINDER)
    if not info_element:
        return []

    # Strip every line once, then drop the ones that end up empty.
    stripped_lines = (raw.strip() for raw in info_element.get_text().splitlines())
    return [text for text in stripped_lines if text != '']
# Patterns that identify links to sub-directories and to files.
REGEX_TO_FOLDERS = '/tree/master'
REGEX_TO_FILES = '/blob/master'


def extract_hrefs(repository_content):
    """
    Collect the hrefs of directory and file links found in a listing.

    ``repository_content`` must expose ``find_all(href=<compiled regex>)``
    returning elements that support ``element['href']``.

    Returns a tuple ``(hrefs_to_folders, hrefs_to_files)``: two lists with
    the hrefs matching ``REGEX_TO_FOLDERS`` and ``REGEX_TO_FILES``
    respectively, in the order ``find_all`` yielded them.
    """
    folder_pattern = re.compile(REGEX_TO_FOLDERS)
    folder_tags = repository_content.find_all(href=folder_pattern)
    file_pattern = re.compile(REGEX_TO_FILES)
    file_tags = repository_content.find_all(href=file_pattern)

    def hrefs_of(tags):
        # Pull the href attribute out of every matched element.
        return [tag['href'] for tag in tags]

    return hrefs_of(folder_tags), hrefs_of(file_tags)
def explore_file(file_href):
    """
    Gather the summary information of a single file.

    The file page is fetched with ``pull_file_content`` and its result is
    handed to ``get_lines_and_bytes``; the file name is derived from
    ``file_href`` with ``get_folder_or_file_name``.

    Returns a tuple ``(filename, lines, bytes_, extension)`` where
    ``extension`` is ``get_file_extension(filename)``.
    """
    line_count, byte_count = get_lines_and_bytes(pull_file_content(file_href))
    name = get_folder_or_file_name(file_href)
    extension = get_file_extension(name)
    return name, line_count, byte_count, extension
def include_extension_in_files_dict(f_dict, lines, bytes_, extension):
    """
    Accumulate line and byte totals for an extension.

    ``f_dict`` must already be a dict shaped like
    ``{extension: {'lines': ..., 'bytes': ...}}``. If ``extension`` is new,
    it is stored with the given values; otherwise the given values are
    added to the totals already recorded for it.

    The dict is updated in place and also returned.
    """
    total_lines, total_bytes = lines, bytes_
    if extension in f_dict:
        previous = f_dict[extension]
        total_lines = previous['lines'] + lines
        total_bytes = previous['bytes'] + bytes_
    # Always store a fresh inner dict rather than mutating the previous one.
    f_dict[extension] = {'lines': total_lines, 'bytes': total_bytes}
    return f_dict
def explore_repository(repo_name, tree_str='', files_dict=None, depth=0):
    """
    Recursively walk a repository and accumulate its tree and statistics.

    This is the application's main routine. It fetches the listing at
    ``repo_name``, descends into every folder link, and collects the
    information of every file link.

    Args:
        repo_name: Location of the repository (or, on recursive calls, of
            a sub-folder) passed to ``pull_folder_content``.
        tree_str: Tree text accumulated so far; entries produced by
            ``generate_str_with_spaces`` are appended to it.
        files_dict: Per-extension totals accumulated so far, shaped like
            ``{extension: {'lines': ..., 'bytes': ...}}``. A new dict is
            created when ``None``.
        depth: Recursion depth; ``0`` marks the top-level call.

    Returns:
        ``(tree_str, files_dict)`` on success, or ``None`` when
        ``pull_folder_content`` yields nothing for ``repo_name``.

    Side effects:
        At depth 0 it prints start/finish messages and hands the final
        result to ``print_to_file``.
    """
    repository_content = pull_folder_content(repo_name)
    if not repository_content:
        # Nothing to explore here; callers must cope with None.
        return None

    if depth == 0:
        print('[+] Scraping no repositório ' + repo_name + ' iniciado...')
    if files_dict is None:
        files_dict = {}

    folders, files = extract_hrefs(repository_content)

    for folder_href in folders:
        tree_str += generate_str_with_spaces(
            depth, get_folder_or_file_name(folder_href), is_folder=True)
        explored = explore_repository(
            folder_href, tree_str=tree_str, files_dict=files_dict, depth=depth + 1)
        # A sub-folder whose listing could not be read returns None.
        # Unpacking that directly would raise TypeError and abort the whole
        # repository, so keep the current accumulators and move on.
        if explored is not None:
            tree_str, files_dict = explored

    for file_href in files:
        filename, lines, bytes_, extension = explore_file(file_href)
        files_dict = include_extension_in_files_dict(
            files_dict, lines=lines, bytes_=bytes_, extension=extension)
        tree_str += generate_str_with_spaces(depth, filename, is_folder=False, loc=lines)

    if depth == 0:
        print_to_file(repo_name, tree_str, files_dict)
        print('[+] Scraping no repositório ' + repo_name + ' finalizado!')
    return tree_str, files_dict
# Number of worker processes used when several repositories are explored.
ASYNC_EXPLORATION = 3

if __name__ == '__main__':
    # Keep only the entries of the repositories file that pass validation.
    valid_repos = list(filter(is_valid_repository, read_repositories_file()))
    if valid_repos:
        print('[+] Iniciando exploração dos repositórios...')
        print('[+] [ATENÇÃO] A duração do processamento depende do tamanho do repositório.')
        if len(valid_repos) < 2:
            # A single repository is explored in this process, without a pool.
            for valid_repo in valid_repos:
                explore_repository(valid_repo)
        else:
            # Explore up to ASYNC_EXPLORATION repositories in parallel.
            with Pool(ASYNC_EXPLORATION) as p:
                p.map(explore_repository, valid_repos)