Merge pull request #6 from mandjevant/master
Start summary & url function
asiffarhankhan authored Aug 25, 2019
2 parents 4094e93 + efc0a57 commit e7e9f53
Showing 8 changed files with 207 additions and 0 deletions.
4 changes: 4 additions & 0 deletions functions/summary/config.py
@@ -0,0 +1,4 @@
url = ''  # URL of the article you want to evaluate
language = ''  # language prefix the article is in

lines = 3  # number of sentences in the summary (integer)
19 changes: 19 additions & 0 deletions functions/summary/readme.md
@@ -0,0 +1,19 @@
# summary

Returns a summary of an article's main content.

## Setup

1. `cd functions/summary`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`
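
`summary.py` also pulls data from NLTK at runtime: the English `stopwords` list, and the `punkt` tokenizer that newspaper's `nlp()` call relies on. If these corpora are not already installed, a one-time download like the following should take care of it:

```python
import nltk

nltk.download('punkt')      # sentence tokenizer used by newspaper's nlp()
nltk.download('stopwords')  # English stop-word list used when scoring similarity
```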

## Parameters

Inside the `config.py` file, the following parameters are **necessary** and **customizable**:
- `url`
- `language`
- `lines`

***Do not change the value of other parameters***
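
As an illustration, a filled-in `config.py` might look like the following; the URL and values are placeholders, not part of this commit:

```python
url = 'https://example.com/some-article'  # placeholder article URL
language = 'en'  # English
lines = 3  # three-sentence summary
```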
3 changes: 3 additions & 0 deletions functions/summary/requirements.txt
@@ -0,0 +1,3 @@
networkx==2.1
newspaper3k==0.2.8
nltk==3.4
numpy  # imported directly by summary.py, not pulled in by the pins above
95 changes: 95 additions & 0 deletions functions/summary/summary.py
@@ -0,0 +1,95 @@
# rewritten from https://github.com/edubey/text-summarizer/blob/master/text-summarizer.py

from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
import numpy as np
import newspaper
import config
import networkx as nx
import re


class summarizer:
    # download and parse the article
    def __init__(self):
        self.a = newspaper.Article(config.url)
        self.a.download()
        self.a.parse()
        self.a.nlp()  # needs the nltk 'punkt' data package

        self.hot = newspaper.hot()  # trending search terms; currently unused

    # split the article's main text into lists of words, one list per sentence
    def text_to_sentences(self):
        sentences = list()

        article = self.a.text
        article = article.replace('\n\n', '. ')
        article_sentences = article.split('. ')

        for sentence in article_sentences:
            # strip non-letter characters, then tokenize on spaces
            sentences.append(re.sub('[^a-zA-Z ]', '', sentence).split(' '))

        return sentences

    # similarity of two sentences: 1 - cosine distance of their word-count vectors
    def sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    # builds the pairwise sentence-similarity matrix
    def build_similarity_matrix(self, sentences, stop_words):
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue
                similarity_matrix[idx1][idx2] = self.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        return similarity_matrix

    # run the full summarization pipeline
    def main(self):
        summarize_text = list()

        sentences = self.text_to_sentences()

        sentence_similarity_matrix = self.build_similarity_matrix(sentences, stopwords.words('english'))

        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)  # rank sentences by graph centrality

        ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        # don't ask for more sentences than the article has
        for i in range(min(config.lines, len(ranked_sentence))):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        return summarize_text


if __name__ == '__main__':
    print("Summarized Text: \n", ". ".join(summarizer().main()))
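
To make the similarity measure concrete, here is a minimal standalone sketch (not part of the commit) of the same bag-of-words cosine comparison that `sentence_similarity` performs, on two toy sentences:

```python
from nltk.cluster.util import cosine_distance

# toy tokenized sentences, shaped like the output of text_to_sentences
sent1 = ['the', 'cat', 'sat', 'on', 'the', 'mat']
sent2 = ['the', 'cat', 'lay', 'on', 'the', 'rug']

vocab = list(set(sent1 + sent2))
vec1 = [sent1.count(w) for w in vocab]  # word-count vector for sent1
vec2 = [sent2.count(w) for w in vocab]  # word-count vector for sent2

# 1.0 for identical sentences, 0.0 for sentences sharing no words
print(1 - cosine_distance(vec1, vec2))
```

Identical sentences score 1.0 and disjoint ones 0.0, which is what lets pagerank treat the matrix entries as edge weights.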
5 changes: 5 additions & 0 deletions functions/url/config.py
@@ -0,0 +1,5 @@
url = ''  # URL of the article you want to evaluate
language = ''  # language prefix the article is in

api_key = ''  # Google API key
cse_id = ''  # custom search engine ID
23 changes: 23 additions & 0 deletions functions/url/readme.md
@@ -0,0 +1,23 @@
# url

Takes the title of an article, extracts its nouns, and searches Google to check whether those subjects are currently being talked about.

## Setup

1. `cd functions/url`
2. `virtualenv env`
3. `source env/bin/activate`
4. `pip install -r requirements.txt`
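
`url.py` calls `nltk.word_tokenize` and `nltk.pos_tag`, which depend on the `punkt` and `averaged_perceptron_tagger` NLTK data packages. If they are missing, a one-time download like this should suffice:

```python
import nltk

nltk.download('punkt')                       # tokenizer models
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger
```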

## Parameters

You can get an API key by visiting the [google console](https://code.google.com/apis/console) and clicking "API Access". You will then need to switch on the Custom Search API on the "Services" tab.

Inside the `config.py` file, the following parameters are **necessary**:
- `api_key`
- `cse_id`

The following parameters are **customizable**:
- `url`
- `language`

***Do not change the value of other parameters***
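
For illustration only, a filled-in `config.py` could look like the following; the URL, key, and engine ID are placeholders:

```python
url = 'https://example.com/some-article'  # placeholder article URL
language = 'en'
api_key = 'YOUR_GOOGLE_API_KEY'  # placeholder, not a real key
cse_id = 'YOUR_CSE_ID'  # placeholder custom search engine ID
```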
3 changes: 3 additions & 0 deletions functions/url/requirements.txt
@@ -0,0 +1,3 @@
newspaper3k==0.2.8
nltk==3.4
google-api-python-client==1.7.11
55 changes: 55 additions & 0 deletions functions/url/url.py
@@ -0,0 +1,55 @@
from googleapiclient.discovery import build  # news api requires js, can do that later
import newspaper
import pprint
import config
import nltk
import sys


class url_evaluator:
    # define variables and prepare article instance
    def __init__(self):
        self.a = newspaper.Article(config.url)
        self.a.download()
        self.a.parse()
        self.a.nlp()  # needs the nltk 'punkt' data package

        self.hot = newspaper.hot()  # trending search terms; currently unused

    # get nouns from the article title
    def get_nouns_nltk(self):
        is_noun = lambda pos: pos[:2] == 'NN'
        tokenized = nltk.word_tokenize(self.a.title)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]

        return nouns

    # takes a search term, runs a Google Custom Search on it
    def google_search(self, search_term, **kwargs):
        try:
            service = build('customsearch', 'v1', developerKey=config.api_key)
            res = service.cse().list(q=search_term, cx=config.cse_id, **kwargs).execute()

            return res['items']

        except Exception as e:
            print('Google API returned error', e)
            sys.exit()

    # search each extracted noun and print the results
    def main(self):
        nouns = self.get_nouns_nltk()

        for noun in nouns:
            # search the noun itself, not the literal string 'i'
            results = self.google_search(noun, num=10)

            for result in results:
                pprint.pprint(result)


if __name__ == '__main__':
    url_evaluator().main()
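
As a quick illustration of what `get_nouns_nltk` produces, here is a hypothetical title run through the same tokenize-and-tag pipeline (not part of the commit):

```python
import nltk

title = "Apple unveils new iPhone at September event"  # hypothetical title
tokens = nltk.word_tokenize(title)

# keep tokens whose part-of-speech tag starts with 'NN' (noun variants)
nouns = [word for word, pos in nltk.pos_tag(tokens) if pos[:2] == 'NN']
print(nouns)  # e.g. ['Apple', 'iPhone', 'September', 'event']
```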
