|
| 1 | + |
| 2 | +# --- |
| 3 | +# name: web-csv |
| 4 | +# deployed: true |
| 5 | +# title: CSV Reader |
| 6 | +# description: Returns the data for the CSVs given by the URLs |
| 7 | +# params: |
| 8 | +# - name: url |
| 9 | +# type: array |
| 10 | +# description: Urls for which to get the info |
| 11 | +# required: true |
| 12 | +# examples: |
| 13 | +# - '"https://raw.githubusercontent.com/flexiodata/data/master/sample/sample-contacts.csv"' |
| 14 | +# notes: |
| 15 | +# --- |
| 16 | + |
| 17 | +import csv |
| 18 | +import json |
| 19 | +import tempfile |
| 20 | +import io |
| 21 | +import aiohttp |
| 22 | +import asyncio |
| 23 | +import itertools |
| 24 | +from cerberus import Validator |
| 25 | +from contextlib import closing |
| 26 | +from collections import OrderedDict |
| 27 | + |
def flexio_handler(flex):
    """Fetch the CSVs at the given URLs and emit them as one JSON table.

    Reads a JSON array of positional arguments from ``flex.input``; the
    first argument supplies the URLs (a comma-delimited string or a list,
    as accepted by ``validator_list`` / ``to_list``).  Writes to
    ``flex.output`` a JSON array whose first element is the list of column
    names and whose remaining elements are the data rows, with missing or
    empty cells written as empty strings.

    Raises:
        ValueError: if the input is not a JSON array or fails validation.
    """

    # get the input (named `args` so the builtin `input` is not shadowed)
    args = json.loads(flex.input.read())
    if not isinstance(args, list):
        raise ValueError('Input must be a JSON array of parameters')

    # define the expected parameters and map the values to the parameter names
    # based on the positions of the keys/values
    params = OrderedDict()
    params['urls'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
    #params['columns'] = {'required': True, 'validator': validator_list, 'coerce': to_list}
    args = dict(zip(params.keys(), args))

    # validate the mapped input against the validator
    v = Validator(params, allow_unknown=True)
    args = v.validated(args)
    if args is None:
        raise ValueError('Invalid input parameters')

    urls = args['urls']
    loop = asyncio.get_event_loop()
    temp_fp_all = loop.run_until_complete(fetch_all(urls))

    try:
        flex.output.content_type = 'application/json'
        flex.output.write('[')

        # get the columns from the header row of the input urls
        # NOTE(review): when several urls are given, the header of the last
        # non-empty file is used for every file -- confirm this is intended
        properties = []
        for temp_fp in temp_fp_all:
            # create the wrapper outside the try so the cleanup below never
            # refers to an unbound name
            fp = io.TextIOWrapper(temp_fp, encoding='utf-8-sig')
            try:
                reader = csv.DictReader(fp, delimiter=',', quotechar='"')
                # use the header row itself rather than the keys of the first
                # data row: this also works for header-only files and never
                # picks up DictReader's rest-key for over-long rows
                if reader.fieldnames:
                    properties = list(OrderedDict.fromkeys(reader.fieldnames))
            finally:
                # rewind and release the underlying file so it can be
                # wrapped and read again below
                fp.seek(0)
                fp.detach()

        flex.output.write(json.dumps(properties))

        for temp_fp in temp_fp_all:
            fp = io.TextIOWrapper(temp_fp, encoding='utf-8-sig')
            reader = csv.DictReader(fp, delimiter=',', quotechar='"')
            for row in reader:
                values = [(row.get(p) or '') for p in properties]
                flex.output.write(',' + json.dumps(values))
            temp_fp.close()

        flex.output.write(']')
    finally:
        # make sure every temporary file is released, even when reading or
        # writing fails part-way through (closing twice is harmless)
        for temp_fp in temp_fp_all:
            temp_fp.close()
| 80 | + |
async def fetch_all(urls):
    """Download every url concurrently over one shared client session.

    Returns the results of ``fetch`` for each url, in the same order as
    ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        downloads = [fetch(session, url) for url in urls]
        return await asyncio.gather(*downloads)
| 88 | + |
async def fetch(session, url):
    """Stream the body at ``url`` into a temporary file and return it.

    The returned binary file object is rewound to the beginning; the caller
    is responsible for closing it (which also deletes it).

    Args:
        session: client session whose ``get(url)`` is an async context
            manager yielding a response with a streamable ``content``.
        url: address of the resource to download.

    Raises:
        Whatever the session or the stream raises; the temporary file is
        closed before the error propagates.
    """
    # stream the data from the url into a temporary file and return
    # it for processing, after which it'll be closed and deleted
    temp_fp = tempfile.TemporaryFile()
    try:
        async with session.get(url) as response:
            while True:
                chunk = await response.content.read(1024)
                if not chunk:
                    break
                temp_fp.write(chunk)
        temp_fp.seek(0)  # rewind to the beginning
    except BaseException:
        # don't leak the temporary file when the download fails or the
        # task is cancelled
        temp_fp.close()
        raise
    return temp_fp
| 101 | + |
def validator_list(field, value, error):
    """Validation callback accepting a string or a list made only of strings.

    Calls ``error(field, message)`` once for every non-string list item, or
    once when ``value`` is neither a string nor a list.
    """
    if isinstance(value, list):
        offenders = [entry for entry in value if not isinstance(entry, str)]
        for _ in offenders:
            error(field, 'Must be a list with only string values')
    elif not isinstance(value, str):
        error(field, 'Must be a string or a list of strings')
| 111 | + |
def to_list(value):
    """Coerce a parameter value into a flat list.

    * a string is split on commas;
    * a list has any nested lists flattened by one level, while its other
      items (e.g. plain strings) are kept intact;
    * anything else yields ``None``.
    """
    if isinstance(value, str):
        return value.split(",")
    if isinstance(value, list):
        flattened = []
        for item in value:
            if isinstance(item, list):
                flattened.extend(item)
            else:
                # keep plain items whole; iterating a string here would
                # break it up into single characters
                flattened.append(item)
        return flattened
    return None
0 commit comments