Commit 1920426

updated RSS with better feed processing, property selection, and header/limit options
1 parent fcd9c80 commit 1920426

1 file changed: web-rss.py (+124 −24 lines)
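A note on the new output shape before the diff: the handler now streams a single JSON array, and with the default `headers=true` config option the first element is an array naming the selected properties, followed by one array per feed item. A hypothetical result with two items (values invented for illustration):

    [["channel_title", "item_title", "item_link"],
     ["Ars Technica", "Some headline", "https://arstechnica.com/example-article"],
     ["MIT Technology Review", "Another headline", "https://www.technologyreview.com/example-article"]]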
@@ -9,36 +9,60 @@
 #     type: array
 #     description: Urls for which to get the info
 #     required: true
+# returns:
+#   - name: channel_title
+#     type: string
+#     description: The feed channel title
+#   - name: channel_link
+#     type: string
+#     description: The feed channel link
+#   - name: item_title
+#     type: string
+#     description: The article title
+#   - name: item_author
+#     type: string
+#     description: The article author
+#   - name: item_link
+#     type: string
+#     description: The article link
+#   - name: item_published
+#     type: string
+#     description: The date/time the article was published
+#   - name: item_description
+#     type: string
+#     description: A description for the article
 # examples:
-#   - '"https://news.ycombinator.com/rss"'
-#   - '"https://news.ycombinator.com/rss,http://feeds.arstechnica.com/arstechnica/index/"'
-#   - 'A1:A3'
+#   - '"http://feeds.arstechnica.com/arstechnica/technology-lab"'
+#   - '"http://feeds.arstechnica.com/arstechnica/technology-lab,https://www.technologyreview.com/feed/"'
+#   - '"https://www.technologyreview.com/feed/","channel_title,item_title,item_author,item_link"'
 # notes:
 # ---
 
 import json
+import time
+import urllib
+import tempfile
 import aiohttp
 import asyncio
 import itertools
+import feedparser
 from cerberus import Validator
 from collections import OrderedDict
-from bs4 import BeautifulSoup
 
 def flexio_handler(flex):
 
     # get the input
     input = flex.input.read()
-    try:
-        input = json.loads(input)
-        if not isinstance(input, list): raise ValueError
-    except ValueError:
+    input = json.loads(input)
+    if not isinstance(input, list):
         raise ValueError
 
     # define the expected parameters and map the values to the parameter names
     # based on the positions of the keys/values
     params = OrderedDict()
     params['urls'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
-    #params['columns'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
+    params['properties'] = {'required': False, 'validator': validator_list, 'coerce': to_list, 'default': '*'}
+    params['config'] = {'required': False, 'type': 'string', 'default': ''} # index-styled config string
     input = dict(zip(params.keys(), input))
 
     # validate the mapped input against the validator
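The cerberus validation that consumes this `params` schema sits in unchanged lines just below, so it does not appear in the diff. A minimal sketch of the positional-mapping pattern, assuming the handler calls `Validator.validated()` (which returns `None` on failure, matching the `if input is None` check in the next hunk); the `coerce` lambdas are simplified stand-ins for the real `to_list`/`validator_list` helpers:

    from collections import OrderedDict
    from cerberus import Validator

    # simplified mirror of the schema in web-rss.py
    params = OrderedDict()
    params['urls'] = {'required': True, 'type': 'list',
                      'coerce': lambda v: str(v).split(',')}
    params['properties'] = {'required': False, 'type': 'list', 'default': ['*'],
                            'coerce': lambda v: str(v).split(',')}
    params['config'] = {'required': False, 'type': 'string', 'default': ''}

    # positional arguments, e.g. from the third example in the metadata above
    args = ['https://www.technologyreview.com/feed/',
            'channel_title,item_title,item_author,item_link']
    mapped = dict(zip(params.keys(), args))  # position -> parameter name

    v = Validator(params)
    validated = v.validated(mapped)          # None on failure; defaults filled in
    print(validated['properties'])           # ['channel_title', 'item_title', 'item_author', 'item_link']
    print(validated['config'])               # ''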
@@ -47,31 +71,101 @@ def flexio_handler(flex):
     if input is None:
         raise ValueError
 
+    # map this function's property names to the API's property names
+    property_map = OrderedDict()
+    property_map['channel_title'] = 'channel_title'
+    property_map['channel_link'] = 'channel_link'
+    property_map['item_title'] = 'item_title'
+    property_map['item_author'] = 'item_author'
+    property_map['item_link'] = 'item_link'
+    property_map['item_published'] = 'item_published'
+    property_map['item_description'] = 'item_description'
+
+    # get the properties to return and the property map;
+    # if we have a wildcard, get all the properties
+    properties = [p.lower().strip() for p in input['properties']]
+    if len(properties) == 1 and (properties[0] == '' or properties[0] == '*'):
+        properties = list(property_map.keys())
+
+    # get any configuration settings
+    config = urllib.parse.parse_qs(input['config'])
+    config = {k: v[0] for k, v in config.items()}
+    limit = int(config.get('limit', 10000))
+    headers = config.get('headers', 'true').lower()
+    if headers == 'true':
+        headers = True
+    else:
+        headers = False
+
+    # get the feeds
     urls = input['urls']
     loop = asyncio.get_event_loop()
-    result = loop.run_until_complete(fetch_all(urls))
-    flex.output.write(result)
+    temp_fp_all = loop.run_until_complete(fetch_all(urls))
+
+    # write the output
+    flex.output.content_type = 'application/json'
+    flex.output.write('[')
+
+    if headers is True:
+        flex.output.write(json.dumps(properties))
+
+    idx = 0
+    for temp_fp in temp_fp_all:
+        while True:
+            row = temp_fp.readline()
+            if not row:
+                break
+            if idx >= limit:
+                break
+            row = json.loads(row)
+            content = ''
+            if headers is True or idx > 0:
+                content = ','
+            content = content + json.dumps([(row.get(p) or '') for p in properties])
+            flex.output.write(content)
+            idx = idx + 1
+
+    flex.output.write(']')
 
 async def fetch_all(urls):
     tasks = []
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(raise_for_status=True) as session:
         for url in urls:
             tasks.append(fetch(session, url))
-        content = await asyncio.gather(*tasks)
-        return list(itertools.chain.from_iterable(content))
+        temp_fp_all = await asyncio.gather(*tasks)
+        return temp_fp_all
 
 async def fetch(session, url):
-    async with session.get(url) as response:
-        result = await response.text()
-        return parseFeed(result)
-
-def parseFeed(content):
-    result = []
-    soup = BeautifulSoup(content, "xml")
-    items = soup.findAll("item")
+    # get the data, process it and put the results in a temporary
+    # file for aggregating with other results
+    temp_fp = tempfile.TemporaryFile(mode='w+t')
+    try:
+        async with session.get(url) as response:
+            content = await response.text()
+            for item in getFeedItem(content):
+                data = json.dumps(item) + "\n" # application/x-ndjson
+                temp_fp.write(data)
+    except Exception:
+        pass
+    temp_fp.seek(0)
+    return temp_fp
+
+def getFeedItem(content):
+    # see: https://pythonhosted.org/feedparser/
+    parser = feedparser.parse(content)
+    channel = parser.get('channel',{})
+    items = parser.get('entries',[])
     for i in items:
-        result.append([i.title.text, i.link.text, i.pubDate.text, i.description.text])
-    return result
+        yield {
+            'id': i.get('id'),
+            'channel_title': channel.get('title'),
+            'channel_link': channel.get('link'),
+            'item_title': i.get('title'),
+            'item_author': i.get('author'),
+            'item_link': i.get('link'),
+            'item_published': string_from_time(i.get('published_parsed')),
+            'item_description': i.get('description')
+        }
 
 def validator_list(field, value, error):
     if isinstance(value, str):
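Two details in this hunk are worth calling out. First, the new `config` argument is an index-styled string parsed as a URL query string, so a caller can pass something like `headers=false&limit=50` to suppress the header row and cap the number of items. The parsing step behaves like this (standard library only; the sample string is invented):

    from urllib.parse import parse_qs

    config = parse_qs('headers=false&limit=50')    # {'headers': ['false'], 'limit': ['50']}
    config = {k: v[0] for k, v in config.items()}  # keep the first value for each key
    limit = int(config.get('limit', 10000))        # 50
    headers = config.get('headers', 'true').lower() == 'true'  # False

Strictly speaking, `import urllib` alone does not load the `urllib.parse` submodule; the handler gets away with `urllib.parse.parse_qs` because another import (aiohttp, for instance) has already loaded it, but `import urllib.parse` would be the explicit form. Second, each fetch now writes its items to a temporary file as newline-delimited JSON instead of accumulating everything in memory, so feeds are parsed concurrently but streamed out one row at a time, and a feed that fails to download (note the new `raise_for_status=True`) simply yields an empty file rather than failing the whole request.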
@@ -91,3 +185,9 @@ def to_list(value):
     if isinstance(value, list):
         return list(itertools.chain.from_iterable(value))
     return None
+
+def string_from_time(value):
+    try:
+        return time.strftime('%Y-%m-%d %H:%M:%S', value)
+    except:
+        return ''
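`string_from_time` expects the `time.struct_time` that feedparser supplies in `published_parsed`; when an entry has no publish date, `i.get('published_parsed')` returns `None`, `time.strftime` raises a `TypeError`, and the bare `except` falls back to an empty string. A small illustration (the date string is invented):

    import time

    def string_from_time(value):
        try:
            return time.strftime('%Y-%m-%d %H:%M:%S', value)
        except:
            return ''

    parsed = time.strptime('Tue, 05 Jun 2018 13:00:00 GMT', '%a, %d %b %Y %H:%M:%S %Z')
    print(string_from_time(parsed))  # 2018-06-05 13:00:00
    print(string_from_time(None))    # '' (entry without a publish date)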
