Skip to content

Commit

Permalink
fix: Add en scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
andyjjrt committed May 24, 2023
1 parent 7a53342 commit 6afd20e
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 18 deletions.
119 changes: 106 additions & 13 deletions DB.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,36 +6,129 @@ class DB:
def __init__(self, location: str) -> None:
    """Open the SQLite database and create any missing tables.

    Args:
        location: Path handed to ``sqlite3.connect`` (``":memory:"`` works).

    Only the current schema is created.  Every statement uses
    ``CREATE TABLE IF NOT EXISTS``, so issuing an older definition of
    COURSE or RATE first would turn the current definition into a no-op.
    """
    self.con = sqlite3.connect(location)
    cur = self.con.cursor()

    # Column legend (API field => column name, meaning):
    #   subNam       => name       course name
    #   lmtKind                    general-education category
    #   core                       whether it is a core general-education course
    #   langTpe      => lang       language
    #   smtQty                     number of semesters the course spans
    #   subClassroom => classroom  classroom
    #   subGde       => unit       offering unit
    #   subKind      => kind       required / elective / group
    #   subPoint     => point      credits
    #   subTime      => time       time
    cur.execute("""
        CREATE TABLE IF NOT EXISTS COURSE (
            id TEXT,
            y TEXT,
            s TEXT,
            subNum TEXT,
            name TEXT,
            nameEn TEXT,
            teacher TEXT,
            teacherEn TEXT,
            kind INTEGER,
            time TEXT,
            timeEn TEXT,
            lmtKind TEXT,
            lmtKindEn TEXT,
            core INTEGER,
            lang TEXT,
            langEn TEXT,
            smtQty INTEGER,
            classroom TEXT,
            classroomId TEXT,
            unit TEXT,
            unitEn TEXT,
            point REAL,
            subRemainUrl TEXT,
            subSetUrl TEXT,
            subUnitRuleUrl TEXT,
            teaExpUrl TEXT,
            teaSchmUrl TEXT,
            tranTpe TEXT,
            tranTpeEn TEXT,
            info TEXT,
            infoEn TEXT,
            note TEXT,
            noteEn TEXT,
            syllabus TEXT,
            objective TEXT,
            PRIMARY KEY ( id )
        );
    """)
    cur.execute("CREATE TABLE IF NOT EXISTS TEACHER ( id TEXT, name TEXT, UNIQUE( id, name ) )")
    # NOTE(review): the key is (courseId, rowId) only.  If one course can have
    # rates from several teachers, rows sharing a rowId would replace each
    # other -- confirm whether teacherId belongs in the key.
    cur.execute("CREATE TABLE IF NOT EXISTS RATE ( courseId TEXT NOT NULL, rowId TEXT NOT NULL, teacherId TEXT, content TEXT, PRIMARY KEY (courseId, rowId) )")

def addRate(self, rowId: str, courseId: str, teacherId: str, content: str):
    """Store one rating row and commit.

    Args:
        rowId: Identifier of the rate within its course; together with
            ``courseId`` it is the value the INSERT OR REPLACE is keyed on
            in the RATE table.
        courseId: Course the rate belongs to.
        teacherId: Teacher the rate belongs to.
        content: Text of the rate.

    An existing row with the same key is replaced rather than duplicated.
    """
    cur = self.con.cursor()
    # rowId must always be supplied: the RATE table declares it NOT NULL.
    cur.execute(
        "INSERT OR REPLACE INTO RATE (rowId, courseId, teacherId, content) VALUES (?, ?, ?, ?)",
        (rowId, courseId, teacherId, content),
    )
    self.con.commit()

def addTeacher(self, id: str, name: str):
    """Insert (or replace) one ``(id, name)`` row in TEACHER and commit."""
    statement = "INSERT OR REPLACE INTO TEACHER (id, name) VALUES (?, ?)"
    teacherRow = (id, name)
    self.con.cursor().execute(statement, teacherRow)
    self.con.commit()

def addCourse(self, courseData: dict, courseDataEn: dict, syllabus: str, description: str):
    """Insert (or replace) one course row in COURSE and commit.

    Args:
        courseData: Course record with the Chinese field values.
        courseDataEn: The same record with English field values; only the
            fields that have an ``*En`` column (and ``classroomId``) are read.
        syllabus: Text stored in the ``syllabus`` column.
        description: Text stored in the ``objective`` column.

    Raises:
        KeyError: If a required field is missing from either record.
        ValueError: If ``courseData["subPoint"]`` cannot be parsed as a float.
    """
    # The column legend maps subKind => kind (required / elective / group);
    # lmtKind is the general-education category and is stored as text below.
    # NOTE(review): the exact spelling returned by the API is not visible
    # here, so both the full and the one-character forms are accepted.
    kindCodes = {"必修": 1, "必": 1, "選修": 2, "選": 2, "群修": 3, "群": 3}
    kind = kindCodes.get(courseData.get("subKind"), 0)

    # One ordered mapping keeps column names and values aligned.
    row = {
        "id": "{}{}{}".format(courseData["y"], courseData["s"], courseData["subNum"]),
        "y": courseData["y"],
        "s": courseData["s"],
        "subNum": courseData["subNum"],
        "name": courseData["subNam"],
        "nameEn": courseDataEn["subNam"],
        "teacher": courseData["teaNam"],
        "teacherEn": courseDataEn["teaNam"],
        "kind": kind,
        "time": courseData["subTime"],
        "timeEn": courseDataEn["subTime"],
        "lmtKind": courseData["lmtKind"],
        "lmtKindEn": courseDataEn["lmtKind"],
        "core": 1 if courseData["core"] == "是" else 0,
        "lang": courseData["langTpe"],
        "langEn": courseDataEn["langTpe"],
        "smtQty": courseData["smtQty"],
        "classroom": courseData["subClassroom"],
        "classroomId": courseDataEn["subClassroom"],
        "unit": courseData["subGde"],
        "unitEn": courseDataEn["subGde"],
        "point": float(courseData["subPoint"]),
        "subRemainUrl": courseData["subRemainUrl"],
        "subSetUrl": courseData["subSetUrl"],
        "subUnitRuleUrl": courseData["subUnitRuleUrl"],
        "teaExpUrl": courseData["teaExpUrl"],
        "teaSchmUrl": courseData["teaSchmUrl"],
        "tranTpe": courseData["tranTpe"],
        "tranTpeEn": courseDataEn["tranTpe"],
        "info": courseData["info"],
        "infoEn": courseDataEn["info"],
        "note": courseData["note"],
        "noteEn": courseDataEn["note"],
        "syllabus": syllabus,
        "objective": description,
    }

    # Column names are fixed literals above; only values go through placeholders.
    statement = "INSERT OR REPLACE INTO COURSE ( {} ) VALUES ({});".format(
        ", ".join(row), ", ".join("?" for _ in row)
    )
    cur = self.con.cursor()
    cur.execute(statement, tuple(row.values()))
    self.con.commit()

def getCourse(self, y: str, s: str):
    """Return the ``teacher`` value of every course stored for one semester.

    Args:
        y: Value matched against the COURSE ``y`` column.
        s: Value matched against the COURSE ``s`` column.

    Returns:
        A list with one ``str`` per matching COURSE row (duplicates kept).
    """
    cur = self.con.cursor()
    # Filter on the caller's arguments instead of a hard-coded semester, and
    # read the column under the name the COURSE schema uses ("teacher").
    request = cur.execute("SELECT teacher FROM COURSE WHERE y = ? AND s = ?", (y, s))
    return [str(row[0]) for row in request.fetchall()]


if __name__ == "__main__":
Expand Down
5 changes: 5 additions & 0 deletions fetchDescription.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ def fetchDescription(courseId: str):
if len(response.json()) != 1:
raise Exception("No matched course")
result["qrysub"] = response.json()[0]
response = requests.get("http://es.nccu.edu.tw/course/en/{} /".format(courseId))
response.raise_for_status()
if len(response.json()) != 1:
raise Exception("No matched course")
result["qrysubEn"] = response.json()[0]
location = str(result["qrysub"]["teaSchmUrl"]).replace("https://", "http://")

# fetching content
Expand Down
8 changes: 4 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
if not programOptions["skip_class_detail"]:
for course in tqdm.tqdm(courses, leave=False):
detail = fetchDescription("{}{}".format(semester, course["subNum"]))
db.addCourse(course, "".join(detail["description"]), "".join(detail["objectives"]))
db.addCourse(detail["qrysub"], detail["qrysubEn"], "".join(detail["description"]), "".join(detail["objectives"]))
except Exception as e:
logging.error(e)

Expand Down Expand Up @@ -144,7 +144,7 @@
continue

# Add courses to track list
tqdmCourses = tqdm.tqdm(coursesList, leave=False)
tqdmCourses = tqdm.tqdm([*set(coursesList)], leave=False)
for courseId in tqdmCourses:
try:
sleep(0.2)
Expand Down Expand Up @@ -239,8 +239,8 @@
rates = fetchRate("http://newdoc.nccu.edu.tw/teaschm/{}/{}".format(semester, row[-1].find("a")["href"]))

# Write to database
for rate in rates:
db.addRate(courseId, teacherId, str(rate))
for index, rate in enumerate(rates):
db.addRate(index, courseId, teacherId, str(rate))

# # Create folder if not exist
# path = os.path.join(dirPath, "result", teacher, detail["qrysub"]["subNam"])
Expand Down
3 changes: 2 additions & 1 deletion requirement.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ beautifulsoup4
requests
tqdm
pyDes
google-cloud-translate
google-cloud-translate
flask

0 comments on commit 6afd20e

Please sign in to comment.