Skip to content

Commit b5a7e9a

Browse files
committed
Replace hpricot with nokogiri in wordpressdotcom
Hpricot is no longer supported and doesn't build on some modern systems, such as macOS. Our last use of Hpricot was in the wordpressdotcom importer, which is now converted to use Nokogiri. Also rephrase the caveat about alternatives to the wordpressdotcom importer: the importer does import pages, posts, images, etc., so the previous statement was untrue.
1 parent a8bdf68 commit b5a7e9a

File tree

6 files changed

+1035
-53
lines changed

6 files changed

+1035
-53
lines changed

docs/_importers/wordpressdotcom.md

+3-4
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@ Their default values are what you see above.
1717

1818
### Further WordPress migration alternatives
1919

20-
While the above method works, it does not import much of the metadata that is
21-
usually stored in WordPress posts and pages. If you need to export things like
22-
pages, tags, custom fields, image attachments and so on, the following resources
23-
might be useful to you:
20+
While the above method works, it doesn't import absolutely every piece of
21+
metadata. If you need to import custom fields from your pages and posts,
22+
the following resources might be useful to you:
2423

2524
- [Exitwp](https://github.com/thomasf/exitwp) is a configurable tool written in
2625
Python for migrating one or more WordPress blogs into Jekyll (Markdown) format

jekyll-import.gemspec

-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ Gem::Specification.new do |s|
5151

5252
# importer dependencies:
5353
# s.add_development_dependency("behance", "~> 0.3") # uses outdated dependencies
54-
s.add_development_dependency("hpricot", "~> 0.8")
5554
s.add_development_dependency("htmlentities", "~> 4.3")
5655
s.add_development_dependency("mysql2", "~> 0.3")
5756
s.add_development_dependency("open_uri_redirections", "~> 0.2")

lib/jekyll-import/importers/wordpressdotcom.rb

+34-34
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def self.require_deps
88
rubygems
99
fileutils
1010
safe_yaml
11-
hpricot
11+
nokogiri
1212
time
1313
open-uri
1414
open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
2222
end
2323

2424
# Will modify post DOM tree
25-
def self.download_images(title, post_hpricot, assets_folder)
26-
images = (post_hpricot / "img")
25+
def self.download_images(title, post_doc, assets_folder)
26+
images = post_doc.css("img")
2727
return if images.empty?
2828

29-
Jekyll.logger.info "Downloading images for ", title
29+
Jekyll.logger.info "Downloading:", "images for #{title}"
3030
images.each do |i|
3131
uri = URI::DEFAULT_PARSER.escape(i["src"])
3232

3333
dst = File.join(assets_folder, File.basename(uri))
34-
i["src"] = File.join("{{ site.baseurl }}", dst)
34+
i["src"] = File.join("{{site.baseurl}}", dst)
3535
Jekyll.logger.info uri
3636
if File.exist?(dst)
3737
Jekyll.logger.info "Already in cache. Clean assets folder if you want a redownload."
@@ -54,15 +54,18 @@ def self.download_images(title, post_hpricot, assets_folder)
5454

5555
class Item
5656
def initialize(node)
57+
raise "Node is nil" if node.nil?
58+
5759
@node = node
5860
end
5961

6062
def text_for(path)
61-
@node.at(path).inner_text
63+
subnode = @node.at_xpath("./#{path}") || @node.at(path) || @node.children.find { |child| child.name == path }
64+
subnode.text
6265
end
6366

6467
def title
65-
@title ||= text_for(:title).strip
68+
@title ||= text_for("title").strip
6669
end
6770

6871
def permalink_title
@@ -76,12 +79,10 @@ def permalink_title
7679
end
7780

7881
def permalink
79-
# Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
80-
# but sometimes it works right! I think it's the xml declaration
8182
@permalink ||= begin
8283
uri = text_for("link")
83-
uri = @node.at("link").following[0] if uri.empty?
84-
URI(uri.to_s).path
84+
uri = @node.at("link").next_sibling.text if uri.empty?
85+
URI(uri.to_s.strip).path
8586
end
8687
end
8788

@@ -127,12 +128,8 @@ def published?
127128

128129
def excerpt
129130
@excerpt ||= begin
130-
text = Hpricot(text_for("excerpt:encoded")).inner_text
131-
if text.empty?
132-
nil
133-
else
134-
text
135-
end
131+
text = Nokogiri::HTML(text_for("excerpt:encoded")).text
132+
text.empty? ? nil : text
136133
end
137134
end
138135
end
@@ -144,29 +141,32 @@ def self.process(options)
144141
FileUtils.mkdir_p(assets_folder)
145142

146143
import_count = Hash.new(0)
147-
doc = Hpricot::XML(File.read(source))
144+
doc = Nokogiri::XML(File.read(source))
148145
# Fetch authors data from header
149146
authors = Hash[
150-
(doc / :channel / "wp:author").map do |author|
151-
[author.at("wp:author_login").inner_text.strip, {
152-
"login" => author.at("wp:author_login").inner_text.strip,
153-
"email" => author.at("wp:author_email").inner_text,
154-
"display_name" => author.at("wp:author_display_name").inner_text,
155-
"first_name" => author.at("wp:author_first_name").inner_text,
156-
"last_name" => author.at("wp:author_last_name").inner_text,
157-
},]
147+
doc.xpath("//channel/wp:author").map do |author|
148+
[
149+
author.xpath("./wp:author_login").text.strip,
150+
{
151+
"login" => author.xpath("./wp:author_login").text.strip,
152+
"email" => author.xpath("./wp:author_email").text,
153+
"display_name" => author.xpath("./wp:author_display_name").text,
154+
"first_name" => author.xpath("./wp:author_first_name").text,
155+
"last_name" => author.xpath("./wp:author_last_name").text,
156+
},
157+
]
158158
end
159159
] rescue {}
160160

161-
(doc / :channel / :item).each do |node|
161+
doc.css("channel > item").each do |node|
162162
item = Item.new(node)
163-
categories = node.search('category[@domain="category"]').map(&:inner_text).reject { |c| c == "Uncategorized" }.uniq
164-
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
163+
categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
164+
tags = node.css('category[domain="post_tag"]').map(&:text).uniq
165165

166166
metas = {}
167-
node.search("wp:postmeta").each do |meta|
168-
key = meta.at("wp:meta_key").inner_text
169-
value = meta.at("wp:meta_value").inner_text
167+
node.xpath("./wp:postmeta").each do |meta|
168+
key = meta.at_xpath("./wp:meta_key").text
169+
value = meta.at_xpath("./wp:meta_value").text
170170
metas[key] = value
171171
end
172172

@@ -189,7 +189,7 @@ def self.process(options)
189189
}
190190

191191
begin
192-
content = Hpricot(item.text_for("content:encoded"))
192+
content = Nokogiri::HTML(item.text_for("content:encoded"))
193193
header["excerpt"] = item.excerpt if item.excerpt
194194

195195
if fetch
@@ -221,7 +221,7 @@ def self.process(options)
221221
end
222222

223223
import_count.each do |key, value|
224-
Jekyll.logger.info "Imported #{value} #{key}s"
224+
Jekyll.logger.info "Imported", "#{value} #{Util.pluralize(key, value)}"
225225
end
226226
end
227227

lib/jekyll-import/util.rb

+8
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,13 @@ def self.wpautop(pee, br = true)
7373
end
7474
pee
7575
end
76+
77+
def self.pluralize(word, count)
78+
return word if count <= 1
79+
80+
return word if word.end_with?("s")
81+
82+
"#{word}s"
83+
end
7684
end
7785
end

0 commit comments

Comments
 (0)