Skip to content

Commit c68ab98

Browse files
committed
Replace hpricot with nokogiri in wordpressdotcom
Hpricot is no longer maintained and fails to build on some modern systems, such as macOS. Our last remaining use of Hpricot was the wordpressdotcom importer, which has now been converted to use Nokogiri.
1 parent a8bdf68 commit c68ab98

File tree

4 files changed

+1013
-47
lines changed

4 files changed

+1013
-47
lines changed

jekyll-import.gemspec

-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ Gem::Specification.new do |s|
5151

5252
# importer dependencies:
5353
# s.add_development_dependency("behance", "~> 0.3") # uses outdated dependencies
54-
s.add_development_dependency("hpricot", "~> 0.8")
5554
s.add_development_dependency("htmlentities", "~> 4.3")
5655
s.add_development_dependency("mysql2", "~> 0.3")
5756
s.add_development_dependency("open_uri_redirections", "~> 0.2")

lib/jekyll-import/importers/wordpressdotcom.rb

+31-32
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def self.require_deps
88
rubygems
99
fileutils
1010
safe_yaml
11-
hpricot
11+
nokogiri
1212
time
1313
open-uri
1414
open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
2222
end
2323

2424
# Will modify post DOM tree
25-
def self.download_images(title, post_hpricot, assets_folder)
26-
images = (post_hpricot / "img")
25+
def self.download_images(title, post_doc, assets_folder)
26+
images = post_doc.css("img")
2727
return if images.empty?
2828

2929
Jekyll.logger.info "Downloading images for ", title
3030
images.each do |i|
3131
uri = URI::DEFAULT_PARSER.escape(i["src"])
3232

3333
dst = File.join(assets_folder, File.basename(uri))
34-
i["src"] = File.join("{{ site.baseurl }}", dst)
34+
i["src"] = File.join("{{site.baseurl}}", dst)
3535
Jekyll.logger.info uri
3636
if File.exist?(dst)
3737
Jekyll.logger.info "Already in cache. Clean assets folder if you want a redownload."
@@ -55,14 +55,16 @@ def self.download_images(title, post_hpricot, assets_folder)
5555
class Item
5656
def initialize(node)
5757
@node = node
58+
raise "Node is nil" if node.nil?
5859
end
5960

6061
def text_for(path)
61-
@node.at(path).inner_text
62+
subnode = @node.at_xpath("./#{path}") || @node.at(path) || @node.children.find { |child| child.name == path }
63+
subnode.text
6264
end
6365

6466
def title
65-
@title ||= text_for(:title).strip
67+
@title ||= text_for("title").strip
6668
end
6769

6870
def permalink_title
@@ -76,12 +78,10 @@ def permalink_title
7678
end
7779

7880
def permalink
79-
# Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
80-
# but sometimes it works right! I think it's the xml declaration
8181
@permalink ||= begin
8282
uri = text_for("link")
83-
uri = @node.at("link").following[0] if uri.empty?
84-
URI(uri.to_s).path
83+
uri = @node.at("link").next_sibling.text if uri.empty?
84+
URI(uri.to_s.strip).path
8585
end
8686
end
8787

@@ -127,12 +127,8 @@ def published?
127127

128128
def excerpt
129129
@excerpt ||= begin
130-
text = Hpricot(text_for("excerpt:encoded")).inner_text
131-
if text.empty?
132-
nil
133-
else
134-
text
135-
end
130+
text = Nokogiri::HTML(text_for("excerpt:encoded")).text
131+
text.empty? ? nil : text
136132
end
137133
end
138134
end
@@ -144,29 +140,32 @@ def self.process(options)
144140
FileUtils.mkdir_p(assets_folder)
145141

146142
import_count = Hash.new(0)
147-
doc = Hpricot::XML(File.read(source))
143+
doc = Nokogiri::XML(File.read(source))
148144
# Fetch authors data from header
149145
authors = Hash[
150-
(doc / :channel / "wp:author").map do |author|
151-
[author.at("wp:author_login").inner_text.strip, {
152-
"login" => author.at("wp:author_login").inner_text.strip,
153-
"email" => author.at("wp:author_email").inner_text,
154-
"display_name" => author.at("wp:author_display_name").inner_text,
155-
"first_name" => author.at("wp:author_first_name").inner_text,
156-
"last_name" => author.at("wp:author_last_name").inner_text,
157-
},]
146+
doc.xpath("//channel/wp:author").map do |author|
147+
[
148+
author.xpath("./wp:author_login").text.strip,
149+
{
150+
"login" => author.xpath("./wp:author_login").text.strip,
151+
"email" => author.xpath("./wp:author_email").text,
152+
"display_name" => author.xpath("./wp:author_display_name").text,
153+
"first_name" => author.xpath("./wp:author_first_name").text,
154+
"last_name" => author.xpath("./wp:author_last_name").text,
155+
},
156+
]
158157
end
159158
] rescue {}
160159

161-
(doc / :channel / :item).each do |node|
160+
doc.css("channel > item").each do |node|
162161
item = Item.new(node)
163-
categories = node.search('category[@domain="category"]').map(&:inner_text).reject { |c| c == "Uncategorized" }.uniq
164-
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
162+
categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
163+
tags = node.css('category[domain="post_tag"]').map(&:text).uniq
165164

166165
metas = {}
167-
node.search("wp:postmeta").each do |meta|
168-
key = meta.at("wp:meta_key").inner_text
169-
value = meta.at("wp:meta_value").inner_text
166+
node.xpath("./wp:postmeta").each do |meta|
167+
key = meta.at_xpath("./wp:meta_key").text
168+
value = meta.at_xpath("./wp:meta_value").text
170169
metas[key] = value
171170
end
172171

@@ -189,7 +188,7 @@ def self.process(options)
189188
}
190189

191190
begin
192-
content = Hpricot(item.text_for("content:encoded"))
191+
content = Nokogiri::HTML(item.text_for("content:encoded"))
193192
header["excerpt"] = item.excerpt if item.excerpt
194193

195194
if fetch

0 commit comments

Comments
 (0)