Skip to content

Commit d578d79

Browse files
committed
Ability to extract content excerpts as reported in search results.
Also a fix to -s|--some and -n|--none: these did not yet support passing lists of phrases. This now works correctly if you provide separate phrases via commas.
1 parent 9699738 commit d578d79

File tree

2 files changed

+48
-9
lines changed

2 files changed

+48
-9
lines changed

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Christian
1111
Features
1212
--------
1313

14-
* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication.
14+
* Extracts publication title, most relevant web link, PDF link, number of citations, number of online versions, link to Google Scholar's article cluster for the work, Google Scholar's cluster of all works referencing the publication, and an excerpt of the content.
1515
* Extracts total number of hits as reported by Scholar (new in version 2.5)
1616
* Supports the full range of advanced query options provided by Google Scholar, such as title-only search, publication date timeframes, and inclusion/exclusion of patents and citations.
1717
* Supports article cluster IDs, i.e., information relating to the variants of an article already identified by Google Scholar
@@ -44,6 +44,8 @@ Retrieve one article written by Einstein on quantum theory:
4444
PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf
4545
Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en
4646
Versions list http://scholar.google.com/scholar?cluster=17749203648027613321&hl=en&as_sdt=0,5
47+
Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
48+
4749

4850
Note the cluster ID in the above. Using this ID, you can directly access the cluster of articles Google Scholar has already determined to be variants of the same paper. So, let's see the versions:
4951

@@ -55,18 +57,21 @@ Note the cluster ID in the above. Using this ID, you can directly access the clu
5557
Cluster ID 17749203648027613321
5658
PDF link http://icole.mut-es.ac.ir/downloads/Sci_Sec/W1/Einstein%201917.pdf
5759
Citations list http://scholar.google.com/scholar?cites=17749203648027613321&as_sdt=2005&sciodt=0,5&hl=en
58-
60+
Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
61+
5962
Title ON THE QUANTUM THEORY OF RADIATION
6063
URL http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf
6164
Citations 0
6265
Versions 0
6366
PDF link http://www.informationphilosopher.com/solutions/scientists/einstein/1917_Radiation.pdf
67+
Excerpt The formal similarity between the chromatic distribution curve for thermal radiation [...]
6468

6569
Title The Quantum Theory of Radiation
6670
URL http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf
6771
Citations 0
6872
Versions 0
6973
PDF link http://web.ihep.su/dbserv/compas/src/einstein17/eng.pdf
74+
Excerpt 1 on the assumption that there are discrete elements of energy, from which quantum [...]
7075

7176

7277
Let's retrieve a BibTeX entry for that quantum theory paper. The best BibTeX often seems to be the one linked from search results, not those in the article cluster, so let's do a search again:

scholar.py

+41-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
# ChangeLog
88
# ---------
99
#
10+
# 2.7 Ability to extract content excerpts as reported in search results.
11+
# Also a fix to -s|--some and -n|--none: these did not yet support
12+
# passing lists of phrases. This now works correctly if you provide
13+
# separate phrases via commas.
14+
#
1015
# 2.6 Ability to disable inclusion of patents and citations. This
1116
# has the same effect as unchecking the two patents/citations
1217
# checkboxes in the Scholar UI, which are checked by default.
@@ -191,7 +196,7 @@ class QueryArgumentError(Error):
191196
class ScholarConf(object):
192197
"""Helper class for global settings."""
193198

194-
VERSION = '2.6'
199+
VERSION = '2.7'
195200
LOG_LEVEL = 1
196201
MAX_PAGE_RESULTS = 20 # Current maximum for per-page results
197202
SCHOLAR_SITE = 'http://scholar.google.com'
@@ -249,6 +254,7 @@ def __init__(self):
249254
'url_citations': [None, 'Citations list', 7],
250255
'url_versions': [None, 'Versions list', 8],
251256
'url_citation': [None, 'Citation link', 9],
257+
'excerpt': [None, 'Excerpt', 10],
252258
}
253259

254260
# The citation data in one of the standard export formats,
@@ -376,7 +382,6 @@ def _parse_globals(self):
376382
except (IndexError, ValueError):
377383
pass
378384

379-
380385
def _parse_article(self, div):
381386
self.article = ScholarArticle()
382387

@@ -566,6 +571,14 @@ def _parse_article(self, div):
566571
if tag.find('div', {'class': 'gs_fl'}):
567572
self._parse_links(tag.find('div', {'class': 'gs_fl'}))
568573

574+
if tag.find('div', {'class': 'gs_rs'}):
575+
# These are the content excerpts rendered into the results.
576+
raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True)
577+
if len(raw_text) > 0:
578+
raw_text = ''.join(raw_text)
579+
raw_text = raw_text.replace('\n', '')
580+
self.article['excerpt'] = raw_text
581+
569582

570583
class ScholarQuery(object):
571584
"""
@@ -671,7 +684,7 @@ class SearchScholarQuery(ScholarQuery):
671684
+ '&as_publication=%(pub)s' \
672685
+ '&as_ylo=%(ylo)s' \
673686
+ '&as_yhi=%(yhi)s' \
674-
+ '&as_sdt=%(patents)s,5' \
687+
+ '&as_sdt=%(patents)s%%2C5' \
675688
+ '&as_vis=%(citations)s' \
676689
+ '&btnG=&hl=en' \
677690
+ '&num=%(num)s'
@@ -745,9 +758,30 @@ def get_url(self):
745758
and self.timeframe[0] is None and self.timeframe[1] is None:
746759
raise QueryArgumentError('search query needs more parameters')
747760

761+
# If we have some-words or none-words lists, we need to
762+
# process them so GS understands them. For simple
763+
# space-separated word lists, there's nothing to do. For lists
764+
# of phrases we have to ensure quotations around the phrases,
765+
# separating them by whitespace.
766+
words_some = None
767+
words_none = None
768+
769+
if self.words_some:
770+
if self.words_some.find(',') >= 0:
771+
phrases = self.words_some.split(',')
772+
words_some = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases])
773+
else:
774+
words_some = self.words_some
775+
if self.words_none:
776+
if self.words_none.find(',') >= 0:
777+
phrases = self.words_none.split(',')
778+
words_none = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases])
779+
else:
780+
words_none = self.words_none
781+
748782
urlargs = {'words': self.words or '',
749-
'words_some': self.words_some or '',
750-
'words_none': self.words_none or '',
783+
'words_some': words_some or '',
784+
'words_none': words_none or '',
751785
'phrase': self.phrase or '',
752786
'scope': 'title' if self.scope_title else 'any',
753787
'authors': self.author or '',
@@ -1071,9 +1105,9 @@ def main():
10711105
group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
10721106
help='Results must contain all of these words')
10731107
group.add_option('-s', '--some', metavar='WORDS', default=None,
1074-
help='Results must contain at least one of these words')
1108+
help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
10751109
group.add_option('-n', '--none', metavar='WORDS', default=None,
1076-
help='Results must contain none of these words')
1110+
help='Results must contain none of these words. See -s|--some re. formatting')
10771111
group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
10781112
help='Results must contain exact phrase')
10791113
group.add_option('-t', '--title-only', action='store_true', default=False,

0 commit comments

Comments
 (0)