"""metadata.py: download news articles with newspaper, extract their metadata, and store it via save_db."""
from newspaper import Article
from save_db import *
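# save_db is not shown in this file; based on how it is used below, it is assumed to expose:
#   get_article_list(data_table_name) -> iterable of article URLs whose metadata is still missing
#   save_metadata_db(metadata_table, data_table, url, text, meta_keywords, title,
#                    tags, nlp_keywords, summary, url_sections) -> inserts one metadata row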
metadata_table_name = 'news_stories_meta'
data_table_name = 'news_stories'
def string_format(string):
    """Strip curly and straight quote characters that would otherwise break the database insert."""
    string = str(string)
    string = string.replace(u"\u2018", "").replace(u"\u2019", "").replace("'", "").replace('"', '')
    return string
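# For example (hypothetical input): string_format(u"It\u2019s a 'test'") returns "Its a test".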
def process_url(url):
    """Return the known section/category names that appear in the URL path."""
    url_keywords = ['analysis', 'andhra-pradesh', 'article', 'athletics', 'aviation', 'badminton', 'bangalore', 'banking-and-finance', 'beyond-the-news', 'bollywood', 'books', 'books-authors', 'brand-stories', 'brandstories', 'brunch', 'business', 'business-news', 'business-others', 'chandigarh', 'chennai', 'cities', 'columns', 'companies', 'cricket', 'delhi', 'delhi-news', 'economy', 'editorials', 'education', 'energy-and-environment', 'entertainment', 'explained', 'fitness', 'football', 'health', 'history-and-culture', 'hollywood', 'hyderabad', 'india', 'india-news', 'industry', 'international', 'internet', 'interview', 'karnataka', 'kerala', 'life-and-style', 'lifestyle', 'markets', 'mobile-tabs', 'more-lifestyle', 'movie-reviews', 'movies', 'mumbai', 'mumbai-news', 'music', 'national', 'news', 'noida', 'opinion', 'other-sports', 'other-states', 'pakistan', 'politics-and-policy', 'pro-kabaddi-league', 'pune', 'punjab', 'regional-movies', 'research', 'reviews', 'sci-tech', 'science', 'sex-and-relationships', 'society', 'sponsored-lifestyle', 'sport', 'sport-others', 'sports', 'tamil-nadu', 'tech', 'tech-news-technology', 'technology', 'telangana', 'television', 'tennis', 'thread', 'trending', 'trending-globally', 'trending-in-india', 'tv', 'videos', 'viral-videos-trending', 'web-edits', 'what-is', 'who-is', 'world', 'world-news']
    url_parts = url.split("/")
    url_sections = [string_format(i) for i in url_parts if i in url_keywords]
    return url_sections
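# For example (hypothetical URL): process_url("https://example.com/sports/cricket/some-story")
# returns ['sports', 'cricket'].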
def get_article_metadata(url):
    a = Article(url)

    # Download and process the page
    a.download()
    a.parse()
    a.nlp()  # must run before a.keywords / a.summary are populated

    # Data generated by the news web site
    article_text = a.text
    meta_keywords = [string_format(i) for i in a.meta_keywords]
    print(meta_keywords)
    title = a.title
    tags = a.tags

    # Data from newspaper's NLP API
    nlp_keywords = a.keywords
    nlp_summary = a.summary

    # Article categories inferred from the URL
    url_sections = process_url(url)

    # Save the metadata to the database
    save_metadata_db(metadata_table_name, data_table_name, string_format(url),
                     string_format(article_text), meta_keywords, string_format(title),
                     tags, nlp_keywords, string_format(nlp_summary), url_sections)
    print("SUCCESS:", url)
# Get all articles for which metadata has not been collected yet
article_links = get_article_list(data_table_name)
for url in article_links:
    print(url)
    try:
        get_article_metadata(url)
    except Exception:
        # Retry once (e.g. for transient download errors) before skipping the URL
        try:
            get_article_metadata(url)
        except Exception:
            continue