@@ -8,7 +8,7 @@ def self.require_deps
8
8
rubygems
9
9
fileutils
10
10
safe_yaml
11
- hpricot
11
+ nokogiri
12
12
time
13
13
open-uri
14
14
open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
22
22
end
23
23
24
24
# Will modify post DOM tree
25
- def self . download_images ( title , post_hpricot , assets_folder )
26
- images = ( post_hpricot / "img" )
25
+ def self . download_images ( title , post_doc , assets_folder )
26
+ images = post_doc . css ( "img" )
27
27
return if images . empty?
28
28
29
29
Jekyll . logger . info "Downloading images for " , title
30
30
images . each do |i |
31
31
uri = URI ::DEFAULT_PARSER . escape ( i [ "src" ] )
32
32
33
33
dst = File . join ( assets_folder , File . basename ( uri ) )
34
- i [ "src" ] = File . join ( "{{ site.baseurl }}" , dst )
34
+ i [ "src" ] = File . join ( "{{site.baseurl}}" , dst )
35
35
Jekyll . logger . info uri
36
36
if File . exist? ( dst )
37
37
Jekyll . logger . info "Already in cache. Clean assets folder if you want a redownload."
@@ -55,14 +55,16 @@ def self.download_images(title, post_hpricot, assets_folder)
55
55
class Item
56
56
def initialize ( node )
57
57
@node = node
58
+ raise "Node is nil" if node . nil?
58
59
end
59
60
60
61
def text_for ( path )
61
- @node . at ( path ) . inner_text
62
+ subnode = @node . at_xpath ( "./#{ path } " ) || @node . at ( path ) || @node . children . find { |child | child . name == path }
63
+ subnode . text
62
64
end
63
65
64
66
def title
65
- @title ||= text_for ( : title) . strip
67
+ @title ||= text_for ( " title" ) . strip
66
68
end
67
69
68
70
def permalink_title
@@ -76,12 +78,10 @@ def permalink_title
76
78
end
77
79
78
80
def permalink
79
- # Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
80
- # but sometimes it works right! I think it's the xml declaration
81
81
@permalink ||= begin
82
82
uri = text_for ( "link" )
83
- uri = @node . at ( "link" ) . following [ 0 ] if uri . empty?
84
- URI ( uri . to_s ) . path
83
+ uri = @node . at ( "link" ) . next_sibling . text if uri . empty?
84
+ URI ( uri . to_s . strip ) . path
85
85
end
86
86
end
87
87
@@ -127,12 +127,8 @@ def published?
127
127
128
128
def excerpt
129
129
@excerpt ||= begin
130
- text = Hpricot ( text_for ( "excerpt:encoded" ) ) . inner_text
131
- if text . empty?
132
- nil
133
- else
134
- text
135
- end
130
+ text = Nokogiri ::HTML ( text_for ( "excerpt:encoded" ) ) . text
131
+ text . empty? ? nil : text
136
132
end
137
133
end
138
134
end
@@ -144,29 +140,32 @@ def self.process(options)
144
140
FileUtils . mkdir_p ( assets_folder )
145
141
146
142
import_count = Hash . new ( 0 )
147
- doc = Hpricot ::XML ( File . read ( source ) )
143
+ doc = Nokogiri ::XML ( File . read ( source ) )
148
144
# Fetch authors data from header
149
145
authors = Hash [
150
- ( doc / :channel / "wp:author" ) . map do |author |
151
- [ author . at ( "wp:author_login" ) . inner_text . strip , {
152
- "login" => author . at ( "wp:author_login" ) . inner_text . strip ,
153
- "email" => author . at ( "wp:author_email" ) . inner_text ,
154
- "display_name" => author . at ( "wp:author_display_name" ) . inner_text ,
155
- "first_name" => author . at ( "wp:author_first_name" ) . inner_text ,
156
- "last_name" => author . at ( "wp:author_last_name" ) . inner_text ,
157
- } , ]
146
+ doc . xpath ( "//channel/wp:author" ) . map do |author |
147
+ [
148
+ author . xpath ( "./wp:author_login" ) . text . strip ,
149
+ {
150
+ "login" => author . xpath ( "./wp:author_login" ) . text . strip ,
151
+ "email" => author . xpath ( "./wp:author_email" ) . text ,
152
+ "display_name" => author . xpath ( "./wp:author_display_name" ) . text ,
153
+ "first_name" => author . xpath ( "./wp:author_first_name" ) . text ,
154
+ "last_name" => author . xpath ( "./wp:author_last_name" ) . text ,
155
+ } ,
156
+ ]
158
157
end
159
158
] rescue { }
160
159
161
- ( doc / : channel / : item) . each do |node |
160
+ doc . css ( " channel > item" ) . each do |node |
162
161
item = Item . new ( node )
163
- categories = node . search ( 'category[@ domain="category"]' ) . map ( &:inner_text ) . reject { |c | c == "Uncategorized" } . uniq
164
- tags = node . search ( 'category[@ domain="post_tag"]' ) . map ( &:inner_text ) . uniq
162
+ categories = node . css ( 'category[domain="category"]' ) . map ( &:text ) . reject { |c | c == "Uncategorized" } . uniq
163
+ tags = node . css ( 'category[domain="post_tag"]' ) . map ( &:text ) . uniq
165
164
166
165
metas = { }
167
- node . search ( " wp:postmeta") . each do |meta |
168
- key = meta . at ( " wp:meta_key") . inner_text
169
- value = meta . at ( " wp:meta_value") . inner_text
166
+ node . xpath ( "./ wp:postmeta") . each do |meta |
167
+ key = meta . at_xpath ( "./ wp:meta_key") . text
168
+ value = meta . at_xpath ( "./ wp:meta_value") . text
170
169
metas [ key ] = value
171
170
end
172
171
@@ -189,7 +188,7 @@ def self.process(options)
189
188
}
190
189
191
190
begin
192
- content = Hpricot ( item . text_for ( "content:encoded" ) )
191
+ content = Nokogiri :: HTML ( item . text_for ( "content:encoded" ) )
193
192
header [ "excerpt" ] = item . excerpt if item . excerpt
194
193
195
194
if fetch
0 commit comments