from datetime import datetime, timezone
from pathlib import Path
import backoff
+import bs4
import requests
from feedgen.feed import FeedGenerator
from functional import seq
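The new `bs4` import replaces PyQuery as the HTML parser. Note that `backoff` is imported but not visibly used in this hunk; presumably a decorator elsewhere in the file retries `processPage` when it raises the `HackernewsRateLimitException` seen below. A minimal sketch of that pattern, assuming a wiring the diff does not show:

import backoff

class HackernewsRateLimitException(Exception):
    pass

# Illustrative assumption: exponential backoff on the rate-limit exception.
# The actual decorator arguments used by this project are not in the diff.
@backoff.on_exception(backoff.expo, HackernewsRateLimitException, max_tries=5)
def processPage(pageNumber):
    ...  # fetch the page; raise HackernewsRateLimitException on a 503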
@@ -62,14 +63,18 @@ def processPage(self, pageNumber):
        if (resp.status_code == 503 or html == "Sorry."):
            self.logger.info("Rate limited. Retrying.")
            raise HackernewsRateLimitException("Rate limit occurred.")
-        parser = PyQuery(html)
-        hackerNewsUrls = parser("tr > td.subtext > span.subline > span.age > a")
-        titles = parser(".titleline > a")
-        dates = parser("tr > td.subtext > span.subline > span.age")
+        parser = bs4.BeautifulSoup(html, features="lxml")
+        hackerNewsUrls = seq(parser.select("span.age > a")).map(lambda x: "https://news.ycombinator.com/" + x['href']).to_list()
+        titles = seq(parser.select(".titleline > a")).map(lambda x: x.text).to_list()
+        urls = seq(parser.select(".titleline > a")).map(lambda x: x['href']).map(lambda x: "https://news.ycombinator.com/" + x if x.startswith("item?id") else x).to_list()
+        dates = seq(parser.select("span.age")).map(lambda x: x['title']).map(lambda x: datetime.fromtimestamp(int(re.findall(r"[0-9]{10,}", x)[0]), timezone.utc)).to_list()

-        return seq(zip(titles, hackerNewsUrls, dates)).map(lambda x: HackerNewsStory(x[0].text, x[0].attrib['href'], "https://news.ycombinator.com/" + x[1].attrib['href'], datetime.now(timezone.utc), datetime.fromtimestamp(int(re.findall(r"[0-9]{10,}", x[2].attrib['title'])[0]), timezone.utc)))
+        if not (len(hackerNewsUrls) == len(titles) == len(urls) == len(dates)):
+            raise Exception(f"Error in parsing page {pageNumber}: length of parsed elements is different. Hackernewsurls: {len(hackerNewsUrls)} Titles: {len(titles)} Urls: {len(urls)} Dates: {len(dates)}\n\n#Hackernewsurls\n{hackerNewsUrls}\n\n#Titles\n{titles}\n\n#Urls\n{urls}\n\n#Dates\n{dates}")
+
+        return seq(zip(titles, urls, hackerNewsUrls, dates)).map(lambda x: HackerNewsStory(x[0], x[1], x[2], datetime.now(timezone.utc), x[3])).to_list()

    def readRcFile(self):
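The replacement swaps PyQuery's selectors for BeautifulSoup's `select()` plus PyFunctional pipelines, and splits the old single zip into four parallel lists with an explicit length check before constructing stories. The dates pipeline also relies on `re`, presumably imported in the file lines above this hunk. The selector-plus-pipeline style can be exercised standalone; the snippet below is a minimal sketch using a hand-written fragment that mimics the Hacker News markup (not real site output):

import bs4
from functional import seq

sample = """
<span class="titleline"><a href="https://example.com/post">Example story</a></span>
<span class="subline">
  <span class="age" title="2024-01-01T00:00:00 1704067200"><a href="item?id=1">1 hour ago</a></span>
</span>
"""

parser = bs4.BeautifulSoup(sample, features="lxml")
titles = seq(parser.select(".titleline > a")).map(lambda x: x.text).to_list()
hnUrls = seq(parser.select("span.age > a")).map(lambda x: "https://news.ycombinator.com/" + x['href']).to_list()
print(titles)  # ['Example story']
print(hnUrls)  # ['https://news.ycombinator.com/item?id=1']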
@@ -82,10 +87,10 @@ def readRcFile(self):
    def generateRss(self, stories, useHackernewsUrl=False):
        fg = FeedGenerator()
-        fg.title('Hackernewsd - HN' if useHackernewsUrl else 'Hackernews - Blog')
+        fg.title('Hackernewsd - HN' if useHackernewsUrl else 'Hackernewsd - Blog')
        fg.link(href='http://localhost:5555', rel='alternate')  # TODO parameterize
        # fg.logo('http://ex.com/logo.jpg')
-        fg.subtitle('Hackernewsd - HN' if useHackernewsUrl else 'Hackernews - Blog')
+        fg.subtitle('Hackernewsd - HN' if useHackernewsUrl else 'Hackernewsd - Blog')
        fg.language('en')
        for story in stories:
            fe = fg.add_entry()
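The hunk stops at `add_entry()`, so the per-entry fields are not visible here. For context, a minimal sketch of a complete feedgen round trip in this style; the entry fields chosen (title, link, pubDate) are assumptions about what the loop body fills in:

from datetime import datetime, timezone
from feedgen.feed import FeedGenerator

fg = FeedGenerator()
fg.title('Hackernewsd - Blog')
fg.link(href='http://localhost:5555', rel='alternate')
fg.subtitle('Hackernewsd - Blog')
fg.language('en')

fe = fg.add_entry()
fe.title('Example story')
fe.link(href='https://example.com/post')
fe.pubDate(datetime(2024, 1, 1, tzinfo=timezone.utc))  # must be tz-aware

print(fg.rss_str(pretty=True).decode('utf-8'))  # serialized RSS 2.0 feed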