"""metadata.py: download news articles with newspaper, extract their metadata, and store it via save_db."""
from newspaper import Article
from save_db import *
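# save_db is not shown in this file; based on how it is used below, it is assumed to expose:
#   get_article_list(data_table_name) -> iterable of article URLs whose metadata is still missing
#   save_metadata_db(metadata_table, data_table, url, text, meta_keywords, title,
#                    tags, nlp_keywords, summary, url_sections) -> inserts one metadata row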
metadata_table_name = 'news_stories_meta'
data_table_name = 'news_stories'
def string_format(string):
    """Strip curly and straight quote characters that would otherwise break the database insert."""
    string = str(string)
    string = string.replace(u"\u2018", "").replace(u"\u2019", "").replace("'", "").replace('"', '')
    return string
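# For example (hypothetical input): string_format(u"It\u2019s a 'test'") returns "Its a test".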
def process_url(url):
    """Return the known section/category names that appear in the URL path."""
    url_keywords = ['analysis', 'andhra-pradesh', 'article', 'athletics', 'aviation', 'badminton', 'bangalore', 'banking-and-finance', 'beyond-the-news', 'bollywood', 'books', 'books-authors', 'brand-stories', 'brandstories', 'brunch', 'business', 'business-news', 'business-others', 'chandigarh', 'chennai', 'cities', 'columns', 'companies', 'cricket', 'delhi', 'delhi-news', 'economy', 'editorials', 'education', 'energy-and-environment', 'entertainment', 'explained', 'fitness', 'football', 'health', 'history-and-culture', 'hollywood', 'hyderabad', 'india', 'india-news', 'industry', 'international', 'internet', 'interview', 'karnataka', 'kerala', 'life-and-style', 'lifestyle', 'markets', 'mobile-tabs', 'more-lifestyle', 'movie-reviews', 'movies', 'mumbai', 'mumbai-news', 'music', 'national', 'news', 'noida', 'opinion', 'other-sports', 'other-states', 'pakistan', 'politics-and-policy', 'pro-kabaddi-league', 'pune', 'punjab', 'regional-movies', 'research', 'reviews', 'sci-tech', 'science', 'sex-and-relationships', 'society', 'sponsored-lifestyle', 'sport', 'sport-others', 'sports', 'tamil-nadu', 'tech', 'tech-news-technology', 'technology', 'telangana', 'television', 'tennis', 'thread', 'trending', 'trending-globally', 'trending-in-india', 'tv', 'videos', 'viral-videos-trending', 'web-edits', 'what-is', 'who-is', 'world', 'world-news']
    url_parts = url.split("/")
    url_sections = [string_format(i) for i in url_parts if i in url_keywords]
    return url_sections
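# For example (hypothetical URL): process_url("https://example.com/sports/cricket/some-story")
# returns ['sports', 'cricket'].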
def get_article_metadata(url):
    a = Article(url)

    # Download and process the page
    a.download()
    a.parse()
    a.nlp()  # must run before a.keywords / a.summary are populated

    # Data generated by the news web site
    article_text = a.text
    meta_keywords = [string_format(i) for i in a.meta_keywords]
    print(meta_keywords)
    title = a.title
    tags = a.tags

    # Data from newspaper's NLP API
    nlp_keywords = a.keywords
    nlp_summary = a.summary

    # Article categories inferred from the URL
    url_sections = process_url(url)

    # Save the metadata to the database
    save_metadata_db(metadata_table_name, data_table_name, string_format(url),
                     string_format(article_text), meta_keywords, string_format(title),
                     tags, nlp_keywords, string_format(nlp_summary), url_sections)
    print("SUCCESS:", url)
# Get all articles for which metadata has not been collected yet
article_links = get_article_list(data_table_name)
for url in article_links:
    print(url)
    try:
        get_article_metadata(url)
    except Exception:
        # Retry once (e.g. for transient download errors) before skipping the URL
        try:
            get_article_metadata(url)
        except Exception:
            continue