worker.py
import dispy


def parse(url, state, category):
    """Fetch an RSS feed, summarize every linked article, and store the results in MongoDB.

    This function contains all the necessary imports, classes, and methods so that
    dispy can ship it as a self-contained unit to the worker nodes.

    Arguments:
        url {string} -- URL of the RSS feed the parser is run on.
        state {string} -- The "State" of the URL. Can be None.
        category {string} -- The "Category" of the URL. Can be None.

    Returns:
        None -- Does not return anything.
    """
    # Imports live inside the function (rather than at module level) so that
    # dispy can serialize everything the worker nodes need in one closure.
    import requests
    from bs4 import BeautifulSoup as bs
    from pymongo import MongoClient
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    HOST = '172.18.0.1'  # The host address at which MongoDB is running
    PORT = 8888          # The port at which MongoDB is running
    class ArticleHelper:
        """Wraps sumy's HTML parser and LSA summarizer for a single article."""

        def __init__(self, url):
            language = 'english'
            self.tokenizer = Tokenizer(language)
            self.summarizer = Summarizer(Stemmer(language))
            self.summarizer.stop_words = get_stop_words(language)
            self.article = HtmlParser.from_url(url, self.tokenizer)

        def get_article(self):
            # Return the full article text along with its sentence count.
            sentences = [*self.article.document.sentences]
            texts = [sentence._text for sentence in sentences]
            return ' '.join(texts), len(sentences)

        def get_summary(self, num_sentences):
            # Join with a space so consecutive sentences do not run together.
            return ' '.join(str(sentence) for sentence in self.summarizer(self.article.document, num_sentences))
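
    # Illustrative standalone use of ArticleHelper (the URL is a hypothetical
    # placeholder, not one from the repository's data set):
    #   helper = ArticleHelper('https://example.com/article.html')
    #   text, count = helper.get_article()
    #   print(helper.get_summary(max(1, count // 5)))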
    # Open one MongoDB connection per job instead of reconnecting on every item.
    client = MongoClient(HOST, PORT)
    db = client['articles']

    soup = bs(requests.get(url).content, 'xml')
    for item in soup.channel.find_all('item'):
        helper = ArticleHelper(item.link.string)
        article, length = helper.get_article()
        # Summarize to roughly 20% of the article, but never to zero sentences.
        summary = helper.get_summary(max(1, int(length * 0.2)))
        date = item.pubDate.string
        post = {
            'article': str(article),
            'summary': str(summary),
            'category': str(category),
            'state': str(state),
            'date': str(date)
        }
        try:
            db.posts.insert_one(post)
        except Exception:
            # Skip articles that fail to insert and move on to the next item.
            continue
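
# For a quick sanity check without a dispy cluster, the worker can also be
# called directly; the feed URL, state, and category below are hypothetical:
#
#   parse('https://example.com/rss.xml', 'ExampleState', 'politics')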
if __name__ == '__main__':
    cluster = dispy.JobCluster(parse)
    jobs = []
    with open('data.csv', 'r') as f:
        for line in f:
            # Strip the trailing newline so the category field stays clean.
            url, state, category = line.strip().split(',')
            job = cluster.submit(url, state, category)
            jobs.append(job)
    for job in jobs:
        job()  # A dispy job is callable; calling it waits for completion.
    cluster.print_status()
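
# data.csv is assumed to hold one comma-separated record per line, in the
# order the worker expects; the record below is illustrative only:
#
#   https://example.com/feed.xml,ExampleState,politics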