# scraper.rb

require 'set'

module Docs
class Scraper < Doc
class << self
attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs
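
      # Propagate the class-level configuration to subclasses so each doc
      # scraper starts from its parent's defaults without mutating them, and
      # autoload the subclass's filters from docs/filters/<subclass_name>/.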
def inherited(subclass)
super
subclass.class_eval do
extend AutoloadHelper
autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
end
subclass.base_url = base_url
subclass.root_path = root_path
subclass.initial_paths = initial_paths.dup
subclass.options = options.deep_dup
subclass.html_filters = html_filters.inheritable_copy
subclass.text_filters = text_filters.inheritable_copy
subclass.stubs = stubs.dup
end
def filters
html_filters.to_a + text_filters.to_a
end
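
      # Register a block whose return value is served as the response body
      # for +path+ instead of a real request (see #initialize_stubs).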
def stub(path, &block)
@stubs[path] = block
@stubs
end
end
include Instrumentable
self.initial_paths = []
self.options = {}
self.stubs = {}
self.html_filters = FilterStack.new
self.text_filters = FilterStack.new
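
    # Default pipeline: these HTML filters run first, then any filters a
    # subclass pushes onto its inherited html_filters copy, then the text
    # filters below.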
html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths', 'parse_cf_email', 'external_urls'
text_filters.push 'images' # ensure the images filter runs after all html filters
text_filters.push 'inner_html', 'clean_text', 'attribution'
def initialize
super
initialize_stubs
end
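
    # Register a Typhoeus stub for each path declared with .stub so the
    # block's return value is served as a fake 200 HTML response.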
def initialize_stubs
self.class.stubs.each do |path, block|
Typhoeus.stub(url_for(path)).and_return do
Typhoeus::Response.new \
effective_url: url_for(path),
code: 200,
headers: { 'Content-Type' => 'text/html' },
body: self.instance_exec(&block)
end
end
end
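
    # Fetch a single page and run it through the filter pipeline.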
def build_page(path)
response = request_one url_for(path)
result = handle_response(response)
yield result if block_given?
result
end
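
    # Crawl from the initial URLs, yielding the processed data for each page
    # and queueing newly discovered internal URLs. A case-insensitive set of
    # visited URLs ensures no page is requested twice.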
def build_pages
history = Set.new initial_urls.map(&:downcase)
instrument 'running.scraper', urls: initial_urls
request_all initial_urls do |response|
next unless data = handle_response(response)
yield data
next unless data[:internal_urls].present?
next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
instrument 'queued.scraper', urls: next_urls
next_urls
end
end
def base_url
@base_url ||= URL.parse self.class.base_url
end
def root_url
@root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
end
def root_path
self.class.root_path
end
def root_path?
root_path.present? && root_path != '/'
end
def initial_paths
self.class.initial_paths
end
def initial_urls
@initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
end
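
    # Memoized HTML::Pipeline built from the class's filter stacks, with
    # instrumentation routed through Docs.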
def pipeline
@pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
pipeline.instrumentation_service = Docs
end
end
def options
@options ||= self.class.options.deep_dup.tap do |options|
options.merge! base_url: base_url, root_url: root_url,
root_path: root_path, initial_paths: initial_paths,
version: self.class.version, release: self.class.release
if root_path?
(options[:skip] ||= []).concat ['', '/']
end
if options[:only] || options[:only_patterns]
(options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
end
options.merge!(additional_options)
options.freeze
end
end
private
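
    # Transport is left to subclasses (in devdocs, UrlScraper performs HTTP
    # requests and FileScraper reads from disk).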
def request_one(url)
raise NotImplementedError
end
    def request_all(urls, &block)
raise NotImplementedError
end
def process_response?(response)
raise NotImplementedError
end
def url_for(path)
if path.empty? || path == '/'
root_url.to_s
else
File.join(base_url.to_s, path)
end
end
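
    # Run a response through the pipeline, or instrument why it was ignored.
    # When Docs.rescue_errors is set, errors are reported and the page is
    # skipped instead of aborting the scrape.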
def handle_response(response)
if process_response?(response)
instrument 'process_response.scraper', response: response do
process_response(response)
end
else
instrument 'ignore_response.scraper', response: response
end
rescue => e
if Docs.rescue_errors
instrument 'error.doc', exception: e, url: response.url
nil
else
raise e
end
end
def process_response(response)
data = {}
html, title = parse(response)
context = pipeline_context(response)
context[:html_title] = title
pipeline.call(html, context, data)
data
end
def pipeline_context(response)
options.merge url: response.url
end
def parse(response)
parser = Parser.new(response.body)
[parser.html, parser.title]
end
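
    # Temporarily swap the pipeline's filters for the duration of the block,
    # then drop the memoized pipeline so it is rebuilt with the defaults.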
def with_filters(*filters)
stack = FilterStack.new
stack.push(*filters)
pipeline.instance_variable_set :@filters, stack.to_a.freeze
yield
ensure
@pipeline = nil
end
def additional_options
{}
end
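
    # Two-pass scraping: a first crawl collects the subpaths of every page
    # that produced entries, then store_pages re-runs the scrape restricted
    # to exactly those URLs. Scrapers opt in by including or prepending
    # this module.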
module FixInternalUrlsBehavior
def self.included(base)
base.extend ClassMethods
end
def self.prepended(base)
class << base
prepend ClassMethods
end
end
module ClassMethods
def internal_urls
@internal_urls
end
def store_pages(store)
instrument 'info.doc', msg: 'Building internal urls...'
with_internal_urls do
instrument 'info.doc', msg: 'Continuing...'
super
end
end
private
def with_internal_urls
@internal_urls = new.fetch_internal_urls
yield
ensure
@internal_urls = nil
end
end
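
      # First pass: crawl the whole doc and record the subpath of every page
      # that produced entries.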
def fetch_internal_urls
result = []
build_pages do |page|
result << page[:subpath] if page[:entries].present?
end
result
end
def initial_urls
return super unless self.class.internal_urls
@initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
end
private
def additional_options
if self.class.internal_urls
super.merge! \
only: self.class.internal_urls.to_set,
only_patterns: nil,
skip: nil,
skip_patterns: nil,
skip_links: nil,
fixed_internal_urls: true
else
super
end
end
def process_response(response)
super.merge! response_url: response.url
end
end
end
end
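
# A minimal sketch of a concrete scraper built on this class, assuming the
# devdocs environment (UrlScraper supplies the HTTP transport). The class
# name, URL, release, and filter paths below are hypothetical, for
# illustration only:
#
#   module Docs
#     class Example < UrlScraper
#       self.release = '1.0'
#       self.base_url = 'https://example.com/docs/'
#       self.root_path = 'index.html'
#       self.initial_paths = %w(guide.html)
#
#       # Appended after the default html_filters defined above.
#       html_filters.push 'example/entries', 'example/clean_html'
#
#       options[:skip_patterns] = [/\Achangelog/]
#
#       # Serve a canned body for the root path instead of requesting it.
#       stub '' do
#         '<html><h1>Example</h1></html>'
#       end
#     end
#   end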