diff --git a/.gitignore b/.gitignore
index 050c9d9..dc8bb65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,6 @@
 /log/*
 !/log/.keep
 /tmp
+
+# Ignore sitemap
+public/sitemap.xml
diff --git a/Gemfile b/Gemfile
index dadb5a0..7bbeb72 100644
--- a/Gemfile
+++ b/Gemfile
@@ -23,6 +23,9 @@ gem 'jbuilder', '~> 2.0'
 # bundle exec rake doc:rails generates the API under doc/api.
 gem 'sdoc', '~> 0.4.0', group: :doc
 
+# Use Sitemap to generate sitemap
+gem 'sitemap'
+
 # Use ActiveModel has_secure_password
 # gem 'bcrypt', '~> 3.1.7'
 
diff --git a/app/jobs/sitemap_regenerate_job.rb b/app/jobs/sitemap_regenerate_job.rb
new file mode 100644
index 0000000..53ef0fa
--- /dev/null
+++ b/app/jobs/sitemap_regenerate_job.rb
@@ -0,0 +1,6 @@
+class SitemapRegenerateJob < ActiveJob::Base
+  def perform
+    Rake::Task['sitemap:generate'].invoke
+    Rake::Task['sitemap:ping'].invoke
+  end
+end
diff --git a/config/sitemap.rb b/config/sitemap.rb
new file mode 100644
index 0000000..4024d7c
--- /dev/null
+++ b/config/sitemap.rb
@@ -0,0 +1,13 @@
+Sitemap::Generator.instance.load(host: 'example.com') do
+  path :root, priority: 1, change_frequency: 'weekly'
+  path :search_catalog, priority: 1, change_frequency: 'weekly'
+  read_group = Solrizer.solr_name('read_access_group', :symbol)
+  Work.where(read_group => 'public').each do |f|
+    literal Rails.application.routes.url_helpers.curation_concerns_work_path(f),
+            priority: 1, change_frequency: 'weekly'
+  end
+  Collection.where(read_group => 'public').each do |c|
+    literal Rails.application.routes.url_helpers.collection_path(c),
+            priority: 1, change_frequency: 'weekly'
+  end
+end
diff --git a/lib/tasks/gwss.rake b/lib/tasks/gwss.rake
new file mode 100644
index 0000000..f38b075
--- /dev/null
+++ b/lib/tasks/gwss.rake
@@ -0,0 +1,11 @@
+namespace :gwss do
+  # adding a logger since it got removed from our gemset
+  def logger
+    Rails.logger
+  end
+
+  desc "Queues a job to (re)generate the sitemap.xml"
+  task "sitemap_queue_generate" => :environment do
+    SitemapRegenerateJob.perform_later
+  end
+end
diff --git a/public/robots.txt b/public/robots.txt
index 3c9c7c0..4c696be 100644
--- a/public/robots.txt
+++ b/public/robots.txt
@@ -1,5 +1,10 @@
 # See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
 #
 # To ban all spiders from the entire site uncomment the next two lines:
-# User-agent: *
-# Disallow: /
+User-agent: *
+Disallow: /*?file=thumbnail$
+Sitemap: https://example.com/sitemap.xml
+User-agent: AhrefsBot
+Disallow: /
+User-agent: Pcore-HTTP/v0.23.20
+Disallow: /
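
A minimal sketch of how the pieces above can be exercised, assuming a standard Rails app with an ActiveJob backend running; only the job class and the sitemap/gwss task names come from the diff, the console workflow itself is illustrative:

    # In a Rails console, load the app's rake tasks so Rake::Task lookups resolve:
    Rails.application.load_tasks

    # Enqueue the background job added above (same effect as
    # `bundle exec rake gwss:sitemap_queue_generate`):
    SitemapRegenerateJob.perform_later

    # Or run the sitemap gem's tasks synchronously, bypassing the queue:
    Rake::Task['sitemap:generate'].invoke   # regenerates public/sitemap.xml
    Rake::Task['sitemap:ping'].invoke       # notifies search engines of the sitemap URL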