This repository was archived by the owner on Jul 24, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_csv.py
More file actions
87 lines (72 loc) · 2.22 KB
/
parse_csv.py
File metadata and controls
87 lines (72 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup
import csv
import os
import re
import requests
import shutil
import sys
# Input CSV file, opened by get_articles().
CSV_FILE = 'knowledgebase.csv'
# Directory prefix that save_articles() writes one file per article into.
ARTICLES_PATH = 'app/articles/'
# Directory prefix that download_src() saves downloaded images into.
IMG_PATH = 'app/static/articles/'
# Prefix of the image paths download_src() returns for use in article HTML.
PUBLIC_IMG_PATH = '/static/articles/'
def get_articles(csv_file=None):
    """Load the published articles from the knowledge-base CSV export.

    Prints the number of published articles found.

    Args:
        csv_file: Path of the CSV file to read. Defaults to the module-level
            ``CSV_FILE`` when omitted.

    Returns:
        A list of row dicts (as produced by ``csv.DictReader``) for every row
        whose ``Published`` column is exactly the string ``'true'``.

    Raises:
        KeyError: If the header has no ``Published`` column and the file
            contains at least one data row.
        OSError: If the file cannot be opened.
    """
    if csv_file is None:
        csv_file = CSV_FILE
    # The csv module requires newline='' so that line breaks embedded in
    # quoted fields reach the parser untranslated.
    with open(csv_file, 'r', newline='') as handle:
        articles = [
            row for row in csv.DictReader(handle)
            if row['Published'] == 'true'
        ]
    print(len(articles))
    return articles
def create_directory(path):
    """Create ``path`` as an empty directory, discarding previous contents.

    If ``path`` already exists as a directory it is removed recursively
    first. Missing parent directories are created as well, so nested paths
    work on a fresh checkout.

    Args:
        path: Directory path to (re)create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    if os.path.isdir(path):
        shutil.rmtree(path)
    # makedirs rather than mkdir: mkdir fails when a parent is missing.
    os.makedirs(path)
def slugify(string):
    """Return a slug for ``string``.

    Parentheses are removed, every space becomes a dash, and the result is
    lower-cased. No other characters are removed or replaced.
    """
    # Single translation pass: ' ' -> '-', while '(' and ')' are deleted.
    table = str.maketrans(' ', '-', '()')
    return string.translate(table).lower()
def save_articles(articles):
    """Write every article to its own file under ``ARTICLES_PATH``.

    The file name is the article's ``Updated At`` value with spaces replaced
    by dashes. The file holds four header lines (question, slugified
    question, ``Updated At``, section) followed by the answer HTML as
    returned by ``parse_html``.

    Prints each target path before processing and an empty line after each
    article has been written.

    Args:
        articles: Iterable of mappings with the keys ``Question``,
            ``Updated At``, ``Section`` and ``Answer Html``.
    """
    for entry in articles:
        target = ARTICLES_PATH + entry['Updated At'].replace(' ', '-')
        print(target)
        # Header lines first; parse_html runs only after they are assembled.
        contents = (
            entry['Question'] + "\n"
            + slugify(entry['Question']) + "\n"
            + entry['Updated At'] + "\n"
            + entry['Section'] + "\n"
        )
        contents += parse_html(entry['Answer Html'])
        with open(target, 'w') as out:
            out.write(contents)
        print("")
def parse_html(html):
    """Rewrite the image sources in ``html`` to locally downloaded copies.

    Every ``<img>`` that has a non-empty ``src`` is passed to
    ``download_src`` and each occurrence of that source string in ``html``
    is replaced with the returned path, which is also printed.

    Args:
        html: HTML markup of one article answer.

    Returns:
        The markup with image sources substituted. Empty input is returned
        unchanged.
    """
    if not html:
        # Nothing to parse; skip building a parse tree.
        return html
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without a usable src has nothing to download, and
            # replacing an empty string would corrupt the markup.
            continue
        newsrc = download_src(src)
        html = html.replace(src, newsrc)
        print(newsrc)
    return html
def download_src(src):
    """Download the image at ``src`` and return its local public path.

    Host-relative sources (``/path``) are resolved against
    ``http://support.cellabus.com``; protocol-relative sources
    (``//host/path``) get an ``http:`` scheme. The file is saved under
    ``IMG_PATH`` using the last path segment of the final URL (query string
    removed), with ``.png`` appended when that name has no extension.

    The download is best-effort: whenever the image cannot be fetched or no
    file name can be derived, ``src`` is returned unchanged.

    Args:
        src: Value of an ``<img src>`` attribute.

    Returns:
        ``PUBLIC_IMG_PATH`` plus the saved file name on success, otherwise
        ``src`` itself.
    """
    if not src:
        return src

    if src.startswith('//'):
        # Protocol-relative URL: only the scheme is missing, not the host.
        remote_path = 'http:' + src
    elif src.startswith('/'):
        remote_path = 'http://support.cellabus.com' + src
    else:
        remote_path = src

    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        img = requests.get(remote_path, timeout=30)
    except requests.RequestException:
        # Treated like a non-200 response: keep the source as it was.
        return src
    if img.status_code != 200:
        return src

    src_name = img.url.split('/')[-1].split('?')[0]
    if not src_name:
        # URL ended in '/', so there is no file name to save under.
        return src
    if not os.path.splitext(src_name)[1]:
        src_name += '.png'

    local_path = IMG_PATH + src_name
    with open(local_path, 'wb') as handle:
        handle.write(img.content)
    return PUBLIC_IMG_PATH + src_name
if __name__ == "__main__":
    # The CSV is read first; if that raises, the existing output
    # directories are left untouched.
    published = get_articles()
    for output_dir in (ARTICLES_PATH, IMG_PATH):
        create_directory(output_dir)
    save_articles(published)