9
9
# type: array
10
10
# description: Urls for which to get the info
11
11
# required: true
12
+ # returns:
13
+ # - name: channel_title
14
+ # type: string
15
+ # description: The feed channel title
16
+ # - name: channel_link
17
+ # type: string
18
+ # description: The feed channel link
19
+ # - name: item_title
20
+ # type: string
21
+ # description: The article title
22
+ # - name: item_author
23
+ # type: string
24
+ # description: The article author
25
+ # - name: item_link
26
+ # type: string
27
+ # description: The article link
28
+ # - name: item_published
29
+ # type: string
30
+ # description: The date/time the article was published
31
+ # - name: item_description
32
+ # type: string
33
+ # description: A description for the article
12
34
# examples:
13
- # - '"https ://news.ycombinator .com/rss "'
14
- # - '"https ://news.ycombinator .com/rss,http ://feeds.arstechnica .com/arstechnica/index /"'
15
- # - 'A1:A3 '
35
+ # - '"http://feeds.arstechnica.com/arstechnica/technology-lab"'
36
+ # - '"http://feeds.arstechnica.com/arstechnica/technology-lab,https://www.technologyreview.com/feed/"'
37
+ # - '"https://www.technologyreview.com/feed/","channel_title,item_title,item_author,item_link"'
16
38
# notes:
17
39
# ---
18
40
19
41
import asyncio
import itertools
import json
import tempfile
import time
import urllib
import urllib.parse  # 'import urllib' alone does not guarantee the urllib.parse submodule is loaded

import aiohttp
import feedparser

from cerberus import Validator
from collections import OrderedDict
26
51
27
52
def flexio_handler (flex ):
28
53
29
54
# get the input
30
55
input = flex .input .read ()
31
- try :
32
- input = json .loads (input )
33
- if not isinstance (input , list ): raise ValueError
34
- except ValueError :
56
+ input = json .loads (input )
57
+ if not isinstance (input , list ):
35
58
raise ValueError
36
59
37
60
# define the expected parameters and map the values to the parameter names
38
61
# based on the positions of the keys/values
39
62
params = OrderedDict ()
40
63
params ['urls' ] = {'required' : True , 'validator' : validator_list , 'coerce' : to_list }
41
- #params['columns'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
64
+ params ['properties' ] = {'required' : False , 'validator' : validator_list , 'coerce' : to_list , 'default' : '*' }
65
+ params ['config' ] = {'required' : False , 'type' : 'string' , 'default' : '' } # index-styled config string
42
66
input = dict (zip (params .keys (), input ))
43
67
44
68
# validate the mapped input against the validator
@@ -47,31 +71,101 @@ def flexio_handler(flex):
47
71
if input is None :
48
72
raise ValueError
49
73
74
+ # map this function's property names to the API's property names
75
+ property_map = OrderedDict ()
76
+ property_map ['channel_title' ] = 'channel_title'
77
+ property_map ['channel_link' ] = 'channel_link'
78
+ property_map ['item_title' ] = 'item_title'
79
+ property_map ['item_author' ] = 'item_author'
80
+ property_map ['item_link' ] = 'item_link'
81
+ property_map ['item_published' ] = 'item_published'
82
+ property_map ['item_description' ] = 'item_description'
83
+
84
+ # get the properties to return and the property map;
85
+ # if we have a wildcard, get all the properties
86
+ properties = [p .lower ().strip () for p in input ['properties' ]]
87
+ if len (properties ) == 1 and (properties [0 ] == '' or properties [0 ] == '*' ):
88
+ properties = list (property_map .keys ())
89
+
90
+ # get any configuration settings
91
+ config = urllib .parse .parse_qs (input ['config' ])
92
+ config = {k : v [0 ] for k , v in config .items ()}
93
+ limit = int (config .get ('limit' , 10000 ))
94
+ headers = config .get ('headers' , 'true' ).lower ()
95
+ if headers == 'true' :
96
+ headers = True
97
+ else :
98
+ headers = False
99
+
100
+ # get the feeds
50
101
urls = input ['urls' ]
51
102
loop = asyncio .get_event_loop ()
52
- result = loop .run_until_complete (fetch_all (urls ))
53
- flex .output .write (result )
103
+ temp_fp_all = loop .run_until_complete (fetch_all (urls ))
104
+
105
+ # write the output
106
+ flex .output .content_type = 'application/json'
107
+ flex .output .write ('[' )
108
+
109
+ if headers is True :
110
+ flex .output .write (json .dumps (properties ))
111
+
112
+ idx = 0
113
+ for temp_fp in temp_fp_all :
114
+ while True :
115
+ row = temp_fp .readline ()
116
+ if not row :
117
+ break
118
+ if idx >= limit :
119
+ break
120
+ row = json .loads (row )
121
+ content = ''
122
+ if headers is True or idx > 0 :
123
+ content = ','
124
+ content = content + json .dumps ([(row .get (p ) or '' ) for p in properties ])
125
+ flex .output .write (content )
126
+ idx = idx + 1
127
+
128
+ flex .output .write (']' )
54
129
55
130
async def fetch_all (urls ):
56
131
tasks = []
57
- async with aiohttp .ClientSession () as session :
132
+ async with aiohttp .ClientSession (raise_for_status = True ) as session :
58
133
for url in urls :
59
134
tasks .append (fetch (session , url ))
60
- content = await asyncio .gather (* tasks )
61
- return list ( itertools . chain . from_iterable ( content ))
135
+ temp_fp_all = await asyncio .gather (* tasks )
136
+ return temp_fp_all
62
137
63
138
async def fetch (session , url ):
64
- async with session .get (url ) as response :
65
- result = await response .text ()
66
- return parseFeed (result )
67
-
68
- def parseFeed (content ):
69
- result = []
70
- soup = BeautifulSoup (content , "xml" )
71
- items = soup .findAll ("item" )
139
+ # get the data, process it and put the results in a temporary
140
+ # file for aggregating with other results
141
+ temp_fp = tempfile .TemporaryFile (mode = 'w+t' )
142
+ try :
143
+ async with session .get (url ) as response :
144
+ content = await response .text ()
145
+ for item in getFeedItem (content ):
146
+ data = json .dumps (item ) + "\n " # application/x-ndjson
147
+ temp_fp .write (data )
148
+ except Exception :
149
+ pass
150
+ temp_fp .seek (0 )
151
+ return temp_fp
152
+
153
+ def getFeedItem (content ):
154
+ # see: https://pythonhosted.org/feedparser/
155
+ parser = feedparser .parse (content )
156
+ channel = parser .get ('channel' ,{})
157
+ items = parser .get ('entries' ,[])
72
158
for i in items :
73
- result .append ([i .title .text , i .link .text , i .pubDate .text , i .description .text ])
74
- return result
159
+ yield {
160
+ 'id' : i .get ('id' ),
161
+ 'channel_title' : channel .get ('title' ),
162
+ 'channel_link' : channel .get ('link' ),
163
+ 'item_title' : i .get ('title' ),
164
+ 'item_author' : i .get ('author' ),
165
+ 'item_link' : i .get ('link' ),
166
+ 'item_published' : string_from_time (i .get ('published_parsed' )),
167
+ 'item_description' : i .get ('description' )
168
+ }
75
169
76
170
def validator_list (field , value , error ):
77
171
if isinstance (value , str ):
@@ -91,3 +185,9 @@ def to_list(value):
91
185
if isinstance (value , list ):
92
186
return list (itertools .chain .from_iterable (value ))
93
187
return None
188
+
189
+ def string_from_time (value ):
190
+ try :
191
+ return time .strftime ('%Y-%m-%d %H:%M:%S' , value )
192
+ except :
193
+ return ''
0 commit comments