ReutersSpider: notes on slow feed and possible fixes

pmyteh committed Apr 11, 2017
1 parent 5560731 · commit 237c8dd
Showing 1 changed file with 22 additions and 4 deletions.

RISJbot/spiders/uk/reuters.py

@@ -2,7 +2,7 @@
 from RISJbot.spiders.basespiders import NewsSitemapSpider
 from RISJbot.loaders import NewsLoader
 # Note: mutate_selector_del_xpath is somewhat naughty. Read its docstring.
-from RISJbot.utils import mutate_selector_del_xpath
+from RISJbot.utils import mutate_selector_del
 from scrapy.loader.processors import Identity, TakeFirst
 from scrapy.loader.processors import Join, Compose, MapCompose
 import re

@@ -23,6 +23,19 @@ class ReutersSpider(NewsSitemapSpider):
     # allowed_domains = ['uk.reuters.com']
     # NOTE: This is a full sitemap, not the usual Google News feed. We use
     #       sitemap_follow to restrict this only to the last few days.
+    # FIXME: These don't seem to be published until quite early the
+    #        following morning, in a batch. Perhaps move to RSS feeds?
+    #        (Multiple, at http://uk.reuters.com/tools/rss). Note that these
+    #        feeds spray URLs which are (1) not unique for a given article,
+    #        as different feeds give different URLs; (2) not the same at
+    #        arrival as at dispatch (301 redirect); (3) not the same as the
+    #        sitemap URLs. This means RefetchControl will re-fetch
+    #        everything, fetching massive numbers of duplicates, unless some
+    #        kind of URL normalisation is done in self.parse_node:
+    #        http://feeds.reuters.com/~r/Reuters/UKTopNews/~3/GmNrSI5n5QE/uk-tesco-results-idUKKBN17D1GO
+    #        -> http://uk.reuters.com/articles/uk-tesco-results-idUKKBN17D1GO
+    #        ... and probably also force-set meta['refetchcontrol_key'] to a
+    #        hash of the normalised URL alone.
     sitemap_urls = ['http://uk.reuters.com/sitemap_index.xml']
     sitemap_follow = [gen_reuters_recent_regex(1)]
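
The gen_reuters_recent_regex() helper used above is not defined in this
diff. As a rough sketch of the idea, restricting sitemap_follow to per-day
sitemaps whose names carry a recent date stamp, something like the following
would do. The YYYYMMDD stamp format is an assumption; the real helper in the
repository may differ:

import datetime

def gen_reuters_recent_regex(days):
    # Pattern string for sitemap_follow: matches any sitemap URL whose
    # name contains today's date stamp or one from the last `days` days.
    # NB: the YYYYMMDD format is assumed, not taken from this commit.
    today = datetime.date.today()
    stamps = [(today - datetime.timedelta(days=d)).strftime('%Y%m%d')
              for d in range(days + 1)]
    return '|'.join(stamps)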

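A minimal sketch of the URL normalisation proposed in the FIXME, assuming
the article slug ('uk-tesco-results-idUKKBN17D1GO' in the example above) is
always the final path segment of the feed URL. The function names and the
choice of SHA-1 for the refetchcontrol_key hash are illustrative, not part
of this commit:

import hashlib
from urllib.parse import urlparse

def normalise_reuters_url(url):
    # Map any of the feed/redirect URL variants onto the canonical
    # uk.reuters.com article URL, keyed on the trailing article slug.
    slug = urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
    return 'http://uk.reuters.com/articles/{}'.format(slug)

def refetchcontrol_key(url):
    # Stable dedup key for RefetchControl: a hash of the normalised URL
    # alone, as the FIXME suggests.
    norm = normalise_reuters_url(url)
    return hashlib.sha1(norm.encode('utf-8')).hexdigest()

A parse_node() override would then rewrite each candidate URL with
normalise_reuters_url() and set meta['refetchcontrol_key'] on the generated
Request before yielding it.
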
@@ -35,7 +48,7 @@ def parse_page(self, response):
         s = response.selector
         # Remove any content from the tree before passing it to the loader.
         # There aren't native scrapy loader/selector methods for this.
-        mutate_selector_del_xpath(s, '//div[contains(@class, "related-content")]')
+        mutate_selector_del(s, 'css', 'div.related-content')

         l = NewsLoader(selector=s)

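The body of mutate_selector_del() is not part of this diff. The general
technique (the 'somewhat naughty' part) is to mutate the lxml tree
underneath a Scrapy/parsel selector in place, since the selector API itself
offers no delete operation. A sketch under that assumption; the real
RISJbot.utils implementation may differ:

def mutate_selector_del(selector, method, expression):
    # `method` is 'xpath' or 'css'; every node matching `expression` is
    # detached from the lxml tree backing `selector`, mutating it in place.
    for node in getattr(selector, method)(expression):
        element = node.root              # underlying lxml element
        parent = element.getparent()
        if parent is not None:           # cannot detach the root element
            parent.remove(element)
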
@@ -53,7 +66,12 @@ def parse_page(self, response):
         #l.add_schemaorg_bylines()
         #l.add_dublincore()

-        l.add_xpath('bodytext', '//span[@id="article-text"]/*[not(@class="author")]//text()')
-        l.add_xpath('summary', '//meta[@name="description"]/@content')
+        l.add_xpath('bodytext',
+                    '//span[@id="article-text"]/'
+                    '*[not(@class="author")]//text()')
+        l.add_xpath('summary',
+                    '//meta[@name="description"]/@content')
+
+        l.add_value('notes', 'fetchtime delayed by slow feed')

         return l.load_item()
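
To illustrate what the reworked bodytext XPath selects, here is a small
self-contained example using parsel directly. The article markup is
invented for the purpose and simplified from whatever Reuters actually
serves:

from parsel import Selector

html = '''
<span id="article-text">
  <p class="author">By A. Reporter</p>
  <p>First paragraph.</p>
  <p>Second paragraph.</p>
</span>
'''

sel = Selector(text=html)
body = sel.xpath('//span[@id="article-text"]/'
                 '*[not(@class="author")]//text()').extract()
# The class="author" byline is excluded; expected output:
# ['First paragraph.', 'Second paragraph.']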
