Skip to content

Commit 46b0d37

Browse files
Merge remote-tracking branch 'origin/main'
2 parents 7374221 + 9310520 commit 46b0d37

File tree

3 files changed

+385
-7
lines changed

3 files changed

+385
-7
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
backoff==2.2.1
2+
beautifulsoup4==4.12.3
23
feedgen==1.0.0
34
Flask==3.1.0
45
Flask_APScheduler==1.13.1

src/hackernewsd.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from datetime import datetime, timezone
88
from pathlib import Path
99
import backoff
10+
import bs4
1011
import requests
1112
from feedgen.feed import FeedGenerator
1213
from functional import seq
@@ -62,14 +63,18 @@ def processPage(self, pageNumber):
6263
if(resp.status_code == 503 or html == "Sorry."):
6364
self.logger.info("Rate limited. Retrying.")
6465
raise HackernewsRateLimitException("Rate limit occurred.")
65-
parser = PyQuery(html)
66-
hackerNewsUrls = parser("tr > td.subtext > span.subline > span.age > a")
67-
titles = parser(".titleline > a")
68-
dates = parser("tr > td.subtext > span.subline > span.age")
6966

67+
parser = bs4.BeautifulSoup(html, features="lxml")
68+
hackerNewsUrls = seq(parser.select("span.age > a")).map(lambda x: "https://news.ycombinator.com/" + x['href']).to_list()
69+
titles = seq(parser.select(".titleline > a")).map(lambda x: x.text).to_list()
70+
urls = seq(parser.select(".titleline > a")).map(lambda x: x['href']).map(lambda x: "https://news.ycombinator.com/" + x if x.startswith("item?id") else x).to_list()
7071

72+
dates = seq(parser.select("span.age")).map(lambda x: x['title']).map(lambda x: datetime.fromtimestamp(int(re.findall(r"[0-9]{10,}", x)[0]), timezone.utc)).to_list()
7173

72-
return seq(zip(titles, hackerNewsUrls, dates)).map(lambda x: HackerNewsStory(x[0].text, x[0].attrib['href'], "https://news.ycombinator.com/" + x[1].attrib['href'], datetime.now(timezone.utc), datetime.fromtimestamp(int(re.findall(r"[0-9]{10,}", x[2].attrib['title'])[0]), timezone.utc)))
74+
if not (len(hackerNewsUrls) == len(titles) == len(urls) == len(dates)):
75+
raise Exception(f"Error in parsing page {pageNumber}: length of parsed elements is different. Hackernewsurls: {len(hackerNewsUrls)} Titles: {len(titles)} Urls: {len(urls)} Dates: {len(dates)}\n\n#Hackernewsurls\n{hackerNewsUrls}\n\n#Titles\n{titles}\n\n#Urls\n{urls}\n\n#Dates\n{dates}")
76+
77+
return seq(zip(titles, urls, hackerNewsUrls, dates)).map(lambda x: HackerNewsStory(x[0], x[1], x[2], datetime.now(timezone.utc), x[3])).to_list()
7378

7479

7580
def readRcFile(self):
@@ -82,10 +87,10 @@ def readRcFile(self):
8287

8388
def generateRss(self, stories, useHackernewsUrl=False):
8489
fg = FeedGenerator()
85-
fg.title('Hackernewsd - HN' if useHackernewsUrl else 'Hackernews - Blog')
90+
fg.title('Hackernewsd - HN' if useHackernewsUrl else 'Hackernewsd - Blog')
8691
fg.link(href='http://localhost:5555', rel='alternate') #TODO parameterize
8792
# fg.logo('http://ex.com/logo.jpg')
88-
fg.subtitle('Hackernewsd - HN' if useHackernewsUrl else 'Hackernews - Blog')
93+
fg.subtitle('Hackernewsd - HN' if useHackernewsUrl else 'Hackernewsd - Blog')
8994
fg.language('en')
9095
for story in stories:
9196
fe = fg.add_entry()

0 commit comments

Comments (0)