Skip to content

Commit 693652d

Browse files
authored
Replace hpricot with nokogiri in wordpressdotcom (#555)
Merge pull request 555
1 parent 0e3a6f1 commit 693652d

File tree

6 files changed

+1035
-53
lines changed

6 files changed

+1035
-53
lines changed

docs/_importers/wordpressdotcom.md

+3-4
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@ Their default values are what you see above.
1717

1818
### Further WordPress migration alternatives
1919

20-
While the above method works, it does not import much of the metadata that is
21-
usually stored in WordPress posts and pages. If you need to export things like
22-
pages, tags, custom fields, image attachments and so on, the following resources
23-
might be useful to you:
20+
While the above method works, it doesn't import absolutely every piece of
21+
metadata. If you need to import custom fields from your pages and posts,
22+
the following resources might be useful to you:
2423

2524
- [Exitwp](https://github.com/thomasf/exitwp) is a configurable tool written in
2625
Python for migrating one or more WordPress blogs into Jekyll (Markdown) format

jekyll-import.gemspec

-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ Gem::Specification.new do |s|
5151

5252
# importer dependencies:
5353
# s.add_development_dependency("behance", "~> 0.3") # uses outdated dependencies
54-
s.add_development_dependency("hpricot", "~> 0.8")
5554
s.add_development_dependency("htmlentities", "~> 4.3")
5655
s.add_development_dependency("mysql2", "~> 0.3")
5756
s.add_development_dependency("open_uri_redirections", "~> 0.2")

lib/jekyll-import/importers/wordpressdotcom.rb

+34-34
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def self.require_deps
88
rubygems
99
fileutils
1010
safe_yaml
11-
hpricot
11+
nokogiri
1212
time
1313
open-uri
1414
open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
2222
end
2323

2424
# Will modify post DOM tree
25-
def self.download_images(title, post_hpricot, assets_folder)
26-
images = (post_hpricot / "img")
25+
def self.download_images(title, post_doc, assets_folder)
26+
images = post_doc.css("img")
2727
return if images.empty?
2828

29-
Jekyll.logger.info "Downloading images for ", title
29+
Jekyll.logger.info "Downloading:", "images for #{title}"
3030
images.each do |i|
3131
uri = URI::DEFAULT_PARSER.escape(i["src"])
3232

3333
dst = File.join(assets_folder, File.basename(uri))
34-
i["src"] = File.join("{{ site.baseurl }}", dst)
34+
i["src"] = File.join("{{site.baseurl}}", dst)
3535
Jekyll.logger.info uri
3636
if File.exist?(dst)
3737
Jekyll.logger.info "Already in cache. Clean assets folder if you want a redownload."
@@ -54,15 +54,18 @@ def self.download_images(title, post_hpricot, assets_folder)
5454

5555
class Item
5656
def initialize(node)
57+
raise "Node is nil" if node.nil?
58+
5759
@node = node
5860
end
5961

6062
def text_for(path)
61-
@node.at(path).inner_text
63+
subnode = @node.at_xpath("./#{path}") || @node.at(path) || @node.children.find { |child| child.name == path }
64+
subnode.text
6265
end
6366

6467
def title
65-
@title ||= text_for(:title).strip
68+
@title ||= text_for("title").strip
6669
end
6770

6871
def permalink_title
@@ -76,12 +79,10 @@ def permalink_title
7679
end
7780

7881
def permalink
79-
# Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
80-
# but sometimes it works right! I think it's the xml declaration
8182
@permalink ||= begin
8283
uri = text_for("link")
83-
uri = @node.at("link").following[0] if uri.empty?
84-
URI(uri.to_s).path
84+
uri = @node.at("link").next_sibling.text if uri.empty?
85+
URI(uri.to_s.strip).path
8586
end
8687
end
8788

@@ -127,12 +128,8 @@ def published?
127128

128129
def excerpt
129130
@excerpt ||= begin
130-
text = Hpricot(text_for("excerpt:encoded")).inner_text
131-
if text.empty?
132-
nil
133-
else
134-
text
135-
end
131+
text = Nokogiri::HTML(text_for("excerpt:encoded")).text
132+
text.empty? ? nil : text
136133
end
137134
end
138135
end
@@ -144,29 +141,32 @@ def self.process(options)
144141
FileUtils.mkdir_p(assets_folder)
145142

146143
import_count = Hash.new(0)
147-
doc = Hpricot::XML(File.read(source))
144+
doc = Nokogiri::XML(File.read(source))
148145
# Fetch authors data from header
149146
authors = Hash[
150-
(doc / :channel / "wp:author").map do |author|
151-
[author.at("wp:author_login").inner_text.strip, {
152-
"login" => author.at("wp:author_login").inner_text.strip,
153-
"email" => author.at("wp:author_email").inner_text,
154-
"display_name" => author.at("wp:author_display_name").inner_text,
155-
"first_name" => author.at("wp:author_first_name").inner_text,
156-
"last_name" => author.at("wp:author_last_name").inner_text,
157-
},]
147+
doc.xpath("//channel/wp:author").map do |author|
148+
[
149+
author.xpath("./wp:author_login").text.strip,
150+
{
151+
"login" => author.xpath("./wp:author_login").text.strip,
152+
"email" => author.xpath("./wp:author_email").text,
153+
"display_name" => author.xpath("./wp:author_display_name").text,
154+
"first_name" => author.xpath("./wp:author_first_name").text,
155+
"last_name" => author.xpath("./wp:author_last_name").text,
156+
},
157+
]
158158
end
159159
] rescue {}
160160

161-
(doc / :channel / :item).each do |node|
161+
doc.css("channel > item").each do |node|
162162
item = Item.new(node)
163-
categories = node.search('category[@domain="category"]').map(&:inner_text).reject { |c| c == "Uncategorized" }.uniq
164-
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
163+
categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
164+
tags = node.css('category[domain="post_tag"]').map(&:text).uniq
165165

166166
metas = {}
167-
node.search("wp:postmeta").each do |meta|
168-
key = meta.at("wp:meta_key").inner_text
169-
value = meta.at("wp:meta_value").inner_text
167+
node.xpath("./wp:postmeta").each do |meta|
168+
key = meta.at_xpath("./wp:meta_key").text
169+
value = meta.at_xpath("./wp:meta_value").text
170170
metas[key] = value
171171
end
172172

@@ -189,7 +189,7 @@ def self.process(options)
189189
}
190190

191191
begin
192-
content = Hpricot(item.text_for("content:encoded"))
192+
content = Nokogiri::HTML(item.text_for("content:encoded"))
193193
header["excerpt"] = item.excerpt if item.excerpt
194194

195195
if fetch
@@ -221,7 +221,7 @@ def self.process(options)
221221
end
222222

223223
import_count.each do |key, value|
224-
Jekyll.logger.info "Imported #{value} #{key}s"
224+
Jekyll.logger.info "Imported", "#{value} #{Util.pluralize(key, value)}"
225225
end
226226
end
227227

lib/jekyll-import/util.rb

+8
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,13 @@ def self.wpautop(pee, br = true)
7373
end
7474
pee
7575
end
76+
77+
def self.pluralize(word, count)
78+
return word if count <= 1
79+
80+
return word if word.end_with?("s")
81+
82+
"#{word}s"
83+
end
7684
end
7785
end

0 commit comments

Comments
 (0)