|
7 | 7 | # ChangeLog
|
8 | 8 | # ---------
|
9 | 9 | #
|
| 10 | +# 2.7 Ability to extract content excerpts as reported in search results. |
| 11 | +# Also a fix to -s|--some and -n|--none: these did not yet support |
| 12 | +# passing lists of phrases. This now works correctly if you separate |
| 13 | +# the phrases with commas. |
| 14 | +# |
10 | 15 | # 2.6 Ability to disable inclusion of patents and citations. This
|
11 | 16 | # has the same effect as unchecking the two patents/citations
|
12 | 17 | # checkboxes in the Scholar UI, which are checked by default.
|
@@ -191,7 +196,7 @@ class QueryArgumentError(Error):
|
191 | 196 | class ScholarConf(object):
|
192 | 197 | """Helper class for global settings."""
|
193 | 198 |
|
194 |
| - VERSION = '2.6' |
| 199 | + VERSION = '2.7' |
195 | 200 | LOG_LEVEL = 1
|
196 | 201 | MAX_PAGE_RESULTS = 20 # Current maximum for per-page results
|
197 | 202 | SCHOLAR_SITE = 'http://scholar.google.com'
|
@@ -249,6 +254,7 @@ def __init__(self):
|
249 | 254 | 'url_citations': [None, 'Citations list', 7],
|
250 | 255 | 'url_versions': [None, 'Versions list', 8],
|
251 | 256 | 'url_citation': [None, 'Citation link', 9],
|
| 257 | + 'excerpt': [None, 'Excerpt', 10], |
252 | 258 | }
|
253 | 259 |
|
254 | 260 | # The citation data in one of the standard export formats,
|
@@ -376,7 +382,6 @@ def _parse_globals(self):
|
376 | 382 | except (IndexError, ValueError):
|
377 | 383 | pass
|
378 | 384 |
|
379 |
| - |
380 | 385 | def _parse_article(self, div):
|
381 | 386 | self.article = ScholarArticle()
|
382 | 387 |
|
@@ -566,6 +571,14 @@ def _parse_article(self, div):
|
566 | 571 | if tag.find('div', {'class': 'gs_fl'}):
|
567 | 572 | self._parse_links(tag.find('div', {'class': 'gs_fl'}))
|
568 | 573 |
|
| 574 | + if tag.find('div', {'class': 'gs_rs'}): |
| 575 | + # These are the content excerpts rendered into the results. |
| 576 | + raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True) |
| 577 | + if len(raw_text) > 0: |
| 578 | + raw_text = ''.join(raw_text) |
| 579 | + raw_text = raw_text.replace('\n', '') |
| 580 | + self.article['excerpt'] = raw_text |
| 581 | + |
569 | 582 |
|
570 | 583 | class ScholarQuery(object):
|
571 | 584 | """
|
@@ -671,7 +684,7 @@ class SearchScholarQuery(ScholarQuery):
|
671 | 684 | + '&as_publication=%(pub)s' \
|
672 | 685 | + '&as_ylo=%(ylo)s' \
|
673 | 686 | + '&as_yhi=%(yhi)s' \
|
674 |
| - + '&as_sdt=%(patents)s,5' \ |
| 687 | + + '&as_sdt=%(patents)s%%2C5' \ |
675 | 688 | + '&as_vis=%(citations)s' \
|
676 | 689 | + '&btnG=&hl=en' \
|
677 | 690 | + '&num=%(num)s'
|
@@ -745,9 +758,30 @@ def get_url(self):
|
745 | 758 | and self.timeframe[0] is None and self.timeframe[1] is None:
|
746 | 759 | raise QueryArgumentError('search query needs more parameters')
|
747 | 760 |
|
| 761 | + # If we have some-words or none-words lists, we need to |
| 762 | + # process them so GS understands them. For simple |
| 763 | + # space-separated word lists, there's nothing to do. For lists |
| 764 | + # of phrases we have to ensure quotations around the phrases, |
| 765 | + # separating them by whitespace. |
| 766 | + words_some = None |
| 767 | + words_none = None |
| 768 | + |
| 769 | + if self.words_some: |
| 770 | + if self.words_some.find(',') >= 0: |
| 771 | + phrases = self.words_some.split(',') |
| 772 | + words_some = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases]) |
| 773 | + else: |
| 774 | + words_some = self.words_some |
| 775 | + if self.words_none: |
| 776 | + if self.words_none.find(',') >= 0: |
| 777 | + phrases = self.words_none.split(',') |
| 778 | + words_none = ' '.join(['"' + phrase.strip() + '"' for phrase in phrases]) |
| 779 | + else: |
| 780 | + words_none = self.words_none |
| 781 | + |
748 | 782 | urlargs = {'words': self.words or '',
|
749 |
| - 'words_some': self.words_some or '', |
750 |
| - 'words_none': self.words_none or '', |
| 783 | + 'words_some': words_some or '', |
| 784 | + 'words_none': words_none or '', |
751 | 785 | 'phrase': self.phrase or '',
|
752 | 786 | 'scope': 'title' if self.scope_title else 'any',
|
753 | 787 | 'authors': self.author or '',
|
@@ -1071,9 +1105,9 @@ def main():
|
1071 | 1105 | group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
|
1072 | 1106 | help='Results must contain all of these words')
|
1073 | 1107 | group.add_option('-s', '--some', metavar='WORDS', default=None,
|
1074 |
| - help='Results must contain at least one of these words') |
| 1108 | + help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases') |
1075 | 1109 | group.add_option('-n', '--none', metavar='WORDS', default=None,
|
1076 |
| - help='Results must contain none of these words') |
| 1110 | + help='Results must contain none of these words. See -s|--some re. formatting') |
1077 | 1111 | group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
|
1078 | 1112 | help='Results must contain exact phrase')
|
1079 | 1113 | group.add_option('-t', '--title-only', action='store_true', default=False,
|
|
0 commit comments