Merge pull request #6 from mandjevant/master
Start summary & url function
Showing 8 changed files with 207 additions and 0 deletions.
@@ -0,0 +1,4 @@
url = ''  # url you want to evaluate
language = ''  # language prefix the article is in

lines =  # number of sentences of the summary (integer)
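For illustration, a filled-in version of this config template might look like the sketch below. The URL, language code, and line count are placeholder values chosen for the example, not part of the original change.

```python
# example config.py values for the summary function -- placeholders only
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'  # url you want to evaluate
language = 'en'  # language prefix the article is in

lines = 3  # number of sentences of the summary (integer)
```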
@@ -0,0 +1,19 @@
# summary

Returns a summary of an article's main content.

## Setup

1. `cd functions/summary`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`

## Parameters

Inside the `config.py` file, the following parameters are **necessary** and **customizable**:
- `url`
- `language`
- `lines`

***Do not change the value of other parameters.***
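One setup step the `pip install` above does not cover: the summarizer calls `stopwords.words('english')` and `nlp()`, which rely on NLTK corpora that have to be downloaded separately. A one-time step along these lines should be enough (a sketch, assuming a standard NLTK installation; `nlp()` typically needs the `punkt` tokenizer):

```python
import nltk

# one-time download of the NLTK data the summary function relies on
nltk.download('punkt')      # sentence tokenizer used by newspaper's nlp()
nltk.download('stopwords')  # English stop word list used for sentence similarity
```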
@@ -0,0 +1,3 @@
networkx==2.1
newspaper3k==0.2.8
nltk==3.4
@@ -0,0 +1,95 @@
# rewritten from https://github.com/edubey/text-summarizer/blob/master/text-summarizer.py

from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
import numpy as np
import newspaper
import config
import networkx as nx
import re


class summarizer:
    # define variables and prepare the article instance
    def __init__(self):
        self.a = newspaper.build_article(config.url)
        self.a.download()
        self.a.parse()
        self.a.nlp()

        self.hot = newspaper.hot()  # currently trending terms (not used yet)

    # split the article's main text into lists of words, one list per sentence
    def text_to_sentences(self):
        sentences = list()

        article = self.a.text
        article = article.replace('\n\n', '. ')
        article_sentences = article.split('. ')

        for sentence in article_sentences:
            # drop characters other than letters and spaces, then split into words
            sentences.append(re.sub(r'[^a-zA-Z ]', '', sentence).split(' '))

        return sentences

    # cosine similarity between two sentences, using bag-of-words count vectors
    def sentence_similarity(self, sent1, sent2, stop_words=None):
        if stop_words is None:
            stop_words = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for w in sent1:
            if w in stop_words:
                continue
            vector1[all_words.index(w)] += 1

        for w in sent2:
            if w in stop_words:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    # build the pairwise sentence-similarity matrix
    def build_similarity_matrix(self, content, stop_words, sentences):
        # `content` is kept for interface compatibility but is not used here
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue
                similarity_matrix[idx1][idx2] = self.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        return similarity_matrix

    # main of function
    def main(self):
        summarize_text = list()

        sentences = self.text_to_sentences()

        sentence_similarity_matrix = self.build_similarity_matrix(self.a.text, stopwords.words('english'), sentences)

        # rank sentences with PageRank over the similarity graph
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)

        ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        # never ask for more sentences than the article actually has
        for i in range(min(config.lines, len(ranked_sentence))):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        return summarize_text


if __name__ == '__main__':
    print("Summarized Text: \n", ". ".join(summarizer().main()))
@@ -0,0 +1,5 @@
url = ''  # url you want to evaluate
language = ''  # language prefix the article is in

api_key = ''  # google api key
cse_id = ''  # custom search engine ID
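As with the summary function, a filled-in version of this template might look like the sketch below; the URL and the two credential strings are illustrative placeholders, not real values.

```python
# example config.py values for the url function -- placeholders only
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'  # url you want to evaluate
language = 'en'  # language prefix the article is in

api_key = 'YOUR_GOOGLE_API_KEY'  # google api key
cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'  # custom search engine ID
```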
@@ -0,0 +1,23 @@
# url

Takes the title from an article, extracts its nouns, and searches whether those subjects are currently being talked about.

## Setup

1. `cd functions/url`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`

## Parameters

You can get an API key by visiting the [Google console](https://code.google.com/apis/console) and clicking "API Access". You will then need to switch on the Custom Search API on the "Services" tab.

Inside the `config.py` file, the following parameters are **necessary**:
- `api_key`
- `cse_id`

The following parameters are **customizable**:
- `url`
- `language`

***Do not change the value of other parameters.***
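To illustrate the "nouns from the title" step this function is built around, here is a short standalone sketch using the same NLTK calls the evaluator below uses. The title string is made up, and the exact output depends on the tagger.

```python
import nltk

title = "Global markets rally as inflation cools"  # made-up example title

tokens = nltk.word_tokenize(title)
nouns = [word for word, pos in nltk.pos_tag(tokens) if pos.startswith('NN')]

print(nouns)  # e.g. ['markets', 'inflation'] -- exact tags depend on the tagger
```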
@@ -0,0 +1,3 @@
newspaper3k==0.2.8
nltk==3.4
google-api-python-client==1.7.11
@@ -0,0 +1,55 @@ | ||
from googleapiclient.discovery import build # news api requires js, can do that later | ||
import newspaper | ||
import pprint | ||
import config | ||
import nltk | ||
import time | ||
import sys | ||
|
||
|
||
class url_evaluator: | ||
# define variables and prepare article instance | ||
def __init__(self): | ||
self.a = newspaper.build_article(config.url) | ||
self.a.download() | ||
self.a.parse() | ||
self.a.nlp() | ||
|
||
self.hot = newspaper.hot() | ||
|
||
|
||
# get nouns from title | ||
def get_nouns_nltk(self): | ||
is_noun = lambda pos: pos[:2] == 'NN' | ||
tokenized = nltk.word_tokenize(self.a.title) | ||
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] | ||
|
||
return nouns | ||
|
||
|
||
# takes a search term, does a google search on it | ||
def google_search(self, search_term, **kwargs): | ||
try: | ||
service = build('customsearch', 'v1', developerKey=config.api_key) | ||
res = service.cse().list(q=search_term, cx=config.cse_id, **kwargs).execute() | ||
|
||
return res['items'] | ||
|
||
except Exception as e: | ||
print('Google API returned error', e) | ||
sys.exit() | ||
|
||
|
||
# main of function | ||
def main(self): | ||
nouns = self.get_nouns_nltk() | ||
|
||
for i in nouns: | ||
results = self.google_search('i', num=10) | ||
|
||
for result in results: | ||
pprint.pprint(result) | ||
|
||
|
||
if __name__ == '__main__': | ||
url_evaluator().main() |
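As on the summary side, `word_tokenize` and `pos_tag` depend on NLTK data that `pip install -r requirements.txt` does not fetch. A one-time download along these lines should cover it (a sketch, assuming a standard NLTK installation):

```python
import nltk

# one-time download of the NLTK data used by get_nouns_nltk()
nltk.download('punkt')                       # tokenizer for word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger for pos_tag
```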