Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 59 additions & 29 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
#aa='http://dsrd.uc.cl/dara/libcursos/periodo21/ua6_0.html'

N_cursos = 100
url_root = 'http://dsrd.uc.cl/dara/libcursos/periodo22/'
url_root = 'http://dsrd.uc.cl/dara/libcursos/periodo21/'
file_root = 'ua'
titles = ['N','sigla','seccion','creditos','nombre','min','opt','ofg','profesores','horario','actividad','salas','campus','titulos']
titlesFueraPeriodo = ['N','sigla','seccion','creditos','nombre','vac', 'profesores', 'horario', 'actividad', 'salas', 'campus', 'titulos']
#titles = ['N','sigla','seccion','creditos','nombre','vac','profesores','horario','actividad','salas','campus','titulos']
multiples = ['profesores','horario','actividad','salas']
otherUA = []#['bachhu','bachcs']
courses = []
requisitos = {}
esTemporadaInscripcion = False

def get_course(sigla):
for c in courses:
Expand All @@ -22,7 +25,6 @@ def get_course(sigla):

def start():
for ua in otherUA + range(1,N_cursos + 1):

page = 0
aux = 0
if(ua == 9):
Expand All @@ -31,7 +33,6 @@ def start():
else:
offset = 0
lastN = 30

if(ua in otherUA):
offset = -1

Expand All @@ -40,10 +41,10 @@ def start():
url = '%s%s%s'%(url_root,ua,'.html')
else:
url = '%s%s%s%s%s%s'%(url_root,file_root,ua,'_',page,'.html')

f = urllib.urlopen(url)
response_code = f.getcode()

print url
if(response_code != 200 or lastN < 25):
break

Expand All @@ -61,13 +62,11 @@ def start():

def parse(html,offset):
    """Parse one listing page and pass each course row to ``parseCourse``.

    html   -- markup of the page to parse.
    offset -- accepted for backward compatibility with existing callers;
              this implementation does not read it.

    Returns the number of rows handed to ``parseCourse``, or 0 when the page
    yields fewer than 11 rows (in which case nothing is parsed).
    """
    soup = bs4.BeautifulSoup(html)
    # Only direct <tr> children of the document are considered, and the last
    # three are dropped (presumably footer rows -- TODO confirm against a
    # real page).
    # NOTE(review): what counts as a direct child depends on which parser
    # bs4 picks by default; confirm before pinning one explicitly.
    # Named `rows` rather than `courses` so the module-level `courses` list
    # is not shadowed inside this function.
    rows = soup.find_all('tr', recursive=False)[:-3]
    if len(rows) < 11:
        return 0
    for row in rows:
        parseCourse(row)
    return len(rows)
Expand All @@ -77,21 +76,26 @@ def parseCourse(soup):
i = 0
aux = {}
raw = {}
title_array = titles
if(esTemporadaInscripcion == False):
title_array = titlesFueraPeriodo
for td in soup.find_all('td'):
data = td.text
if(titles[i] in multiples):
if(title_array[i] in multiples):
data = td.find_all(text=True)
aux[titles[i]] = data
aux[title_array[i]] = data
i = i + 1
continue
raw[titles[i]] = data
raw[title_array[i]] = data
i = i + 1
#print(raw)
seccion = {}
#seccion['vac'] = raw['vac']
seccion['min'] = raw['min']
seccion['opt'] = raw['opt']
seccion['ofg'] = raw['ofg']
if(esTemporadaInscripcion):
seccion['min'] = raw['min']
seccion['opt'] = raw['opt']
seccion['ofg'] = raw['ofg']
else:
seccion['vac'] = raw['vac']
seccion['seccion'] = raw['seccion']
seccion['campus'] = raw['campus']

Expand Down Expand Up @@ -152,7 +156,7 @@ def startDesc():
curso = {}
curso['sigla'] = sigla
courses.append(curso)
curso['descripcion'] = str(p)
curso['descripcion'] = p.text
#cursos.append(curso)
#print('encontrados %d cursos')%(len(cursos))
i = i + 1
Expand All @@ -162,15 +166,40 @@ def startDesc():
#######################################################

def startReq():
    """Fetch the prerequisite page of every course and fill ``requisitos``.

    Reads the course list from ``output2.json`` (every entry must have a
    ``'sigla'`` key), requests the prerequisite page for each sigla and
    stores, in the module-level ``requisitos`` dict, a list of requirement
    dicts keyed by sigla. Each requirement dict has the keys ``'alumnos'``,
    ``'requisitos'`` and ``'requisitos_especiales'``.

    Returns nothing; the result is the mutation of ``requisitos``.
    Errors from opening the file, decoding the JSON or performing the HTTP
    request are not caught here.
    """
    # Close the file deterministically instead of leaking the handle.
    with open('output2.json') as source:
        # Kept in a local with its own name so the module-level `courses`
        # list is neither shadowed nor confused with this snapshot.
        loaded_courses = json.loads(source.read())

    print("Procesando requisitos, esto puede tomar varios minutos...")
    root_url = "https://www2.puc.cl/ControlPrerrequisitos/jsp/RequisitosAsign.jsp?SIGLA="
    total = len(loaded_courses)
    for i, c in enumerate(loaded_courses):
        if i % 100 == 0:
            print(str(i) + "/" + str(total) + " cursos procesados.")
        url = '%s%s'%(root_url, c['sigla'])

        # Release the connection even if read() fails.
        response = urllib.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()

        soup = bs4.BeautifulSoup(html)
        course_reqs = []
        for td in soup.find_all("td", {"class" : "td"}):
            # Only cells whose first child is a tag with first class
            # "html_tipo_requisito" describe a requirement. Empty cells and
            # cells starting with a text node are skipped instead of raising.
            children = td.contents
            first = children[0] if children else None
            if not isinstance(first, bs4.Tag):
                continue
            first_classes = first.get('class') or []
            if not first_classes or first_classes[0] != u"html_tipo_requisito":
                continue

            tipo_tags = td.find_all("font", {"class" : "html_tipo_requisito"})
            texts = td.find_all("font", {"class" : "html_texto_azul"})
            if not tipo_tags or not texts:
                # Expected <font> elements are missing; nothing to extract.
                continue

            tipo_requisito = tipo_tags[0].text
            # Shorten the generic "applies to every student" label.
            if tipo_requisito == "Requisitos que todos los alumnos deben cumplir.":
                tipo_requisito = "Todos"

            course_reqs.append({
                "alumnos": tipo_requisito,
                # First blue text is the requirement expression, the last one
                # the special requirements; with a single match both keys get
                # the same text.
                "requisitos": texts[0].text,
                "requisitos_especiales": texts[-1].text,
            })

        requisitos[c['sigla']] = course_reqs


#######################################################
#####################Flujo Programa####################
#######################################################

for r in reqs:
sigla = r['sigla']
curso = get_course(sigla)
if curso == None:
continue
curso['requisitos'] = r['req']

start()
print 'Total cursos: \t\t%s'%(len(courses))
Expand All @@ -182,11 +211,12 @@ def startReq():
print 'Total cursos: \t\t%s'%(len(courses))
f = open('output2.json','w')
f.write(json.dumps(courses,sort_keys=True))
startReq()
f.close()

startReq()
print 'Total cursos: \t\t%s'%(len(courses))
f = open('output3.json','w')
f.write(json.dumps(courses,sort_keys=True))
f = open('requisitos.json','w')
f.write(json.dumps(requisitos,sort_keys=True))
f.close()

f = open('output3.json','r')
Expand Down