Merge pull request #6 from mandjevant/master
Start summary & url function
Showing 8 changed files with 207 additions and 0 deletions.
@@ -0,0 +1,4 @@
url = ''  # url you want to evaluate
language = ''  # language prefix the article is in

lines =  # number of sentences of the summary (integer)
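For illustration, a filled-in version of this config template might look like the sketch below. The URL, language code, and line count are placeholder values chosen for the example, not part of the original change.

```python
# example config.py values for the summary function -- placeholders only
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'  # url you want to evaluate
language = 'en'  # language prefix the article is in

lines = 3  # number of sentences of the summary (integer)
```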
@@ -0,0 +1,19 @@
# summary

Returns a summary of an article's main content.

## Setup

1. `cd functions/summary`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`

## Parameters

Inside the `config.py` file, the following parameters are **necessary** and **customizable**:
- `url`
- `language`
- `lines`

***Do not change the value of other parameters.***
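One setup step the `pip install` above does not cover: the summarizer calls `stopwords.words('english')` and `nlp()`, which rely on NLTK corpora that have to be downloaded separately. A one-time step along these lines should be enough (a sketch, assuming a standard NLTK installation; `nlp()` typically needs the `punkt` tokenizer):

```python
import nltk

# one-time download of the NLTK data the summary function relies on
nltk.download('punkt')      # sentence tokenizer used by newspaper's nlp()
nltk.download('stopwords')  # English stop word list used for sentence similarity
```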
@@ -0,0 +1,3 @@
networkx==2.1
newspaper3k==0.2.8
nltk==3.4
@@ -0,0 +1,95 @@
# rewritten from https://github.com/edubey/text-summarizer/blob/master/text-summarizer.py

from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
import numpy as np
import newspaper
import config
import networkx as nx
import re


class summarizer:
    # define variables and prepare the article instance
    def __init__(self):
        self.a = newspaper.build_article(config.url)
        self.a.download()
        self.a.parse()
        self.a.nlp()

        self.hot = newspaper.hot()  # currently trending terms (not used yet)

    # split the article's main text into lists of words, one list per sentence
    def text_to_sentences(self):
        sentences = list()

        article = self.a.text
        article = article.replace('\n\n', '. ')
        article_sentences = article.split('. ')

        for sentence in article_sentences:
            # drop characters other than letters and spaces, then split into words
            sentences.append(re.sub(r'[^a-zA-Z ]', '', sentence).split(' '))

        return sentences

    # cosine similarity between two sentences, using bag-of-words count vectors
    def sentence_similarity(self, sent1, sent2, stop_words=None):
        if stop_words is None:
            stop_words = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for w in sent1:
            if w in stop_words:
                continue
            vector1[all_words.index(w)] += 1

        for w in sent2:
            if w in stop_words:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    # build the pairwise sentence-similarity matrix
    def build_similarity_matrix(self, content, stop_words, sentences):
        # `content` is kept for interface compatibility but is not used here
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue
                similarity_matrix[idx1][idx2] = self.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        return similarity_matrix

    # main of function
    def main(self):
        summarize_text = list()

        sentences = self.text_to_sentences()

        sentence_similarity_matrix = self.build_similarity_matrix(self.a.text, stopwords.words('english'), sentences)

        # rank sentences with PageRank over the similarity graph
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)

        ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        # never ask for more sentences than the article actually has
        for i in range(min(config.lines, len(ranked_sentence))):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        return summarize_text


if __name__ == '__main__':
    print("Summarized Text: \n", ". ".join(summarizer().main()))
@@ -0,0 +1,5 @@
url = ''  # url you want to evaluate
language = ''  # language prefix the article is in

api_key = ''  # google api key
cse_id = ''  # custom search engine ID
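As with the summary function, a filled-in version of this template might look like the sketch below; the URL and the two credential strings are illustrative placeholders, not real values.

```python
# example config.py values for the url function -- placeholders only
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'  # url you want to evaluate
language = 'en'  # language prefix the article is in

api_key = 'YOUR_GOOGLE_API_KEY'  # google api key
cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'  # custom search engine ID
```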
@@ -0,0 +1,23 @@
# url

Takes the title from an article, extracts its nouns, and searches whether those subjects are currently being talked about.

## Setup

1. `cd functions/url`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`

## Parameters

You can get an API key by visiting the [Google console](https://code.google.com/apis/console) and clicking "API Access". You will then need to switch on the Custom Search API on the "Services" tab.

Inside the `config.py` file, the following parameters are **necessary**:
- `api_key`
- `cse_id`

The following parameters are **customizable**:
- `url`
- `language`

***Do not change the value of other parameters.***
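To illustrate the "nouns from the title" step this function is built around, here is a short standalone sketch using the same NLTK calls the evaluator below uses. The title string is made up, and the exact output depends on the tagger.

```python
import nltk

title = "Global markets rally as inflation cools"  # made-up example title

tokens = nltk.word_tokenize(title)
nouns = [word for word, pos in nltk.pos_tag(tokens) if pos.startswith('NN')]

print(nouns)  # e.g. ['markets', 'inflation'] -- exact tags depend on the tagger
```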
@@ -0,0 +1,3 @@
newspaper3k==0.2.8
nltk==3.4
google-api-python-client==1.7.11
@@ -0,0 +1,55 @@ | ||
from googleapiclient.discovery import build # news api requires js, can do that later | ||
import newspaper | ||
import pprint | ||
import config | ||
import nltk | ||
import time | ||
import sys | ||
|
||
|
||
class url_evaluator: | ||
# define variables and prepare article instance | ||
def __init__(self): | ||
self.a = newspaper.build_article(config.url) | ||
self.a.download() | ||
self.a.parse() | ||
self.a.nlp() | ||
|
||
self.hot = newspaper.hot() | ||
|
||
|
||
# get nouns from title | ||
def get_nouns_nltk(self): | ||
is_noun = lambda pos: pos[:2] == 'NN' | ||
tokenized = nltk.word_tokenize(self.a.title) | ||
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] | ||
|
||
return nouns | ||
|
||
|
||
# takes a search term, does a google search on it | ||
def google_search(self, search_term, **kwargs): | ||
try: | ||
service = build('customsearch', 'v1', developerKey=config.api_key) | ||
res = service.cse().list(q=search_term, cx=config.cse_id, **kwargs).execute() | ||
|
||
return res['items'] | ||
|
||
except Exception as e: | ||
print('Google API returned error', e) | ||
sys.exit() | ||
|
||
|
||
# main of function | ||
def main(self): | ||
nouns = self.get_nouns_nltk() | ||
|
||
for i in nouns: | ||
results = self.google_search('i', num=10) | ||
|
||
for result in results: | ||
pprint.pprint(result) | ||
|
||
|
||
if __name__ == '__main__': | ||
url_evaluator().main() |
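As on the summary side, `word_tokenize` and `pos_tag` depend on NLTK data that `pip install -r requirements.txt` does not fetch. A one-time download along these lines should cover it (a sketch, assuming a standard NLTK installation):

```python
import nltk

# one-time download of the NLTK data used by get_nouns_nltk()
nltk.download('punkt')                       # tokenizer for word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger for pos_tag
```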