Commit 1920426

updated RSS with better feed processing, property selection, and header/limit options
1 parent fcd9c80 commit 1920426

1 file changed: web-rss.py (+124 −24 lines)
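A note on the new output shape before the diff: the handler now streams a single JSON array, and with the default `headers=true` config option the first element is an array naming the selected properties, followed by one array per feed item. A hypothetical result with two items (values invented for illustration):

    [["channel_title", "item_title", "item_link"],
     ["Ars Technica", "Some headline", "https://arstechnica.com/example-article"],
     ["MIT Technology Review", "Another headline", "https://www.technologyreview.com/example-article"]]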
@@ -9,36 +9,60 @@
 #     type: array
 #     description: Urls for which to get the info
 #     required: true
+# returns:
+#   - name: channel_title
+#     type: string
+#     description: The feed channel title
+#   - name: channel_link
+#     type: string
+#     description: The feed channel link
+#   - name: item_title
+#     type: string
+#     description: The article title
+#   - name: item_author
+#     type: string
+#     description: The article author
+#   - name: item_link
+#     type: string
+#     description: The article link
+#   - name: item_published
+#     type: string
+#     description: The date/time the article was published
+#   - name: item_description
+#     type: string
+#     description: A description for the article
 # examples:
-#   - '"https://news.ycombinator.com/rss"'
-#   - '"https://news.ycombinator.com/rss,http://feeds.arstechnica.com/arstechnica/index/"'
-#   - 'A1:A3'
+#   - '"http://feeds.arstechnica.com/arstechnica/technology-lab"'
+#   - '"http://feeds.arstechnica.com/arstechnica/technology-lab,https://www.technologyreview.com/feed/"'
+#   - '"https://www.technologyreview.com/feed/","channel_title,item_title,item_author,item_link"'
 # notes:
 # ---
 
 import json
+import time
+import urllib
+import tempfile
 import aiohttp
 import asyncio
 import itertools
+import feedparser
 from cerberus import Validator
 from collections import OrderedDict
-from bs4 import BeautifulSoup
 
 def flexio_handler(flex):
 
     # get the input
     input = flex.input.read()
-    try:
-        input = json.loads(input)
-        if not isinstance(input, list): raise ValueError
-    except ValueError:
+    input = json.loads(input)
+    if not isinstance(input, list):
         raise ValueError
 
     # define the expected parameters and map the values to the parameter names
     # based on the positions of the keys/values
     params = OrderedDict()
     params['urls'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
-    #params['columns'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
+    params['properties'] = {'required': False, 'validator': validator_list, 'coerce': to_list, 'default': '*'}
+    params['config'] = {'required': False, 'type': 'string', 'default': ''} # index-styled config string
     input = dict(zip(params.keys(), input))
 
     # validate the mapped input against the validator
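The cerberus validation that consumes this `params` schema sits in unchanged lines just below, so it does not appear in the diff. A minimal sketch of the positional-mapping pattern, assuming the handler calls `Validator.validated()` (which returns `None` on failure, matching the `if input is None` check in the next hunk); the `coerce` lambdas are simplified stand-ins for the real `to_list`/`validator_list` helpers:

    from collections import OrderedDict
    from cerberus import Validator

    # simplified mirror of the schema in web-rss.py
    params = OrderedDict()
    params['urls'] = {'required': True, 'type': 'list',
                      'coerce': lambda v: str(v).split(',')}
    params['properties'] = {'required': False, 'type': 'list', 'default': ['*'],
                            'coerce': lambda v: str(v).split(',')}
    params['config'] = {'required': False, 'type': 'string', 'default': ''}

    # positional arguments, e.g. from the third example in the metadata above
    args = ['https://www.technologyreview.com/feed/',
            'channel_title,item_title,item_author,item_link']
    mapped = dict(zip(params.keys(), args))  # position -> parameter name

    v = Validator(params)
    validated = v.validated(mapped)          # None on failure; defaults filled in
    print(validated['properties'])           # ['channel_title', 'item_title', 'item_author', 'item_link']
    print(validated['config'])               # ''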
@@ -47,31 +71,101 @@ def flexio_handler(flex):
     if input is None:
         raise ValueError
 
+    # map this function's property names to the API's property names
+    property_map = OrderedDict()
+    property_map['channel_title'] = 'channel_title'
+    property_map['channel_link'] = 'channel_link'
+    property_map['item_title'] = 'item_title'
+    property_map['item_author'] = 'item_author'
+    property_map['item_link'] = 'item_link'
+    property_map['item_published'] = 'item_published'
+    property_map['item_description'] = 'item_description'
+
+    # get the properties to return and the property map;
+    # if we have a wildcard, get all the properties
+    properties = [p.lower().strip() for p in input['properties']]
+    if len(properties) == 1 and (properties[0] == '' or properties[0] == '*'):
+        properties = list(property_map.keys())
+
+    # get any configuration settings
+    config = urllib.parse.parse_qs(input['config'])
+    config = {k: v[0] for k, v in config.items()}
+    limit = int(config.get('limit', 10000))
+    headers = config.get('headers', 'true').lower()
+    if headers == 'true':
+        headers = True
+    else:
+        headers = False
+
+    # get the feeds
     urls = input['urls']
     loop = asyncio.get_event_loop()
-    result = loop.run_until_complete(fetch_all(urls))
-    flex.output.write(result)
+    temp_fp_all = loop.run_until_complete(fetch_all(urls))
+
+    # write the output
+    flex.output.content_type = 'application/json'
+    flex.output.write('[')
+
+    if headers is True:
+        flex.output.write(json.dumps(properties))
+
+    idx = 0
+    for temp_fp in temp_fp_all:
+        while True:
+            row = temp_fp.readline()
+            if not row:
+                break
+            if idx >= limit:
+                break
+            row = json.loads(row)
+            content = ''
+            if headers is True or idx > 0:
+                content = ','
+            content = content + json.dumps([(row.get(p) or '') for p in properties])
+            flex.output.write(content)
+            idx = idx + 1
+
+    flex.output.write(']')
 
 async def fetch_all(urls):
     tasks = []
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(raise_for_status=True) as session:
         for url in urls:
             tasks.append(fetch(session, url))
-        content = await asyncio.gather(*tasks)
-        return list(itertools.chain.from_iterable(content))
+        temp_fp_all = await asyncio.gather(*tasks)
+        return temp_fp_all
 
 async def fetch(session, url):
-    async with session.get(url) as response:
-        result = await response.text()
-        return parseFeed(result)
-
-def parseFeed(content):
-    result = []
-    soup = BeautifulSoup(content, "xml")
-    items = soup.findAll("item")
+    # get the data, process it and put the results in a temporary
+    # file for aggregating with other results
+    temp_fp = tempfile.TemporaryFile(mode='w+t')
+    try:
+        async with session.get(url) as response:
+            content = await response.text()
+            for item in getFeedItem(content):
+                data = json.dumps(item) + "\n" # application/x-ndjson
+                temp_fp.write(data)
+    except Exception:
+        pass
+    temp_fp.seek(0)
+    return temp_fp
+
+def getFeedItem(content):
+    # see: https://pythonhosted.org/feedparser/
+    parser = feedparser.parse(content)
+    channel = parser.get('channel',{})
+    items = parser.get('entries',[])
     for i in items:
-        result.append([i.title.text, i.link.text, i.pubDate.text, i.description.text])
-    return result
+        yield {
+            'id': i.get('id'),
+            'channel_title': channel.get('title'),
+            'channel_link': channel.get('link'),
+            'item_title': i.get('title'),
+            'item_author': i.get('author'),
+            'item_link': i.get('link'),
+            'item_published': string_from_time(i.get('published_parsed')),
+            'item_description': i.get('description')
+        }
 
 def validator_list(field, value, error):
     if isinstance(value, str):
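Two details in this hunk are worth calling out. First, the new `config` argument is an index-styled string parsed as a URL query string, so a caller can pass something like `headers=false&limit=50` to suppress the header row and cap the number of items. The parsing step behaves like this (standard library only; the sample string is invented):

    from urllib.parse import parse_qs

    config = parse_qs('headers=false&limit=50')    # {'headers': ['false'], 'limit': ['50']}
    config = {k: v[0] for k, v in config.items()}  # keep the first value for each key
    limit = int(config.get('limit', 10000))        # 50
    headers = config.get('headers', 'true').lower() == 'true'  # False

Strictly speaking, `import urllib` alone does not load the `urllib.parse` submodule; the handler gets away with `urllib.parse.parse_qs` because another import (aiohttp, for instance) has already loaded it, but `import urllib.parse` would be the explicit form. Second, each fetch now writes its items to a temporary file as newline-delimited JSON instead of accumulating everything in memory, so feeds are parsed concurrently but streamed out one row at a time, and a feed that fails to download (note the new `raise_for_status=True`) simply yields an empty file rather than failing the whole request.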
@@ -91,3 +185,9 @@ def to_list(value):
     if isinstance(value, list):
         return list(itertools.chain.from_iterable(value))
     return None
+
+def string_from_time(value):
+    try:
+        return time.strftime('%Y-%m-%d %H:%M:%S', value)
+    except:
+        return ''
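`string_from_time` expects the `time.struct_time` that feedparser supplies in `published_parsed`; when an entry has no publish date, `i.get('published_parsed')` returns `None`, `time.strftime` raises a `TypeError`, and the bare `except` falls back to an empty string. A small illustration (the date string is invented):

    import time

    def string_from_time(value):
        try:
            return time.strftime('%Y-%m-%d %H:%M:%S', value)
        except:
            return ''

    parsed = time.strptime('Tue, 05 Jun 2018 13:00:00 GMT', '%a, %d %b %Y %H:%M:%S %Z')
    print(string_from_time(parsed))  # 2018-06-05 13:00:00
    print(string_from_time(None))    # '' (entry without a publish date)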
