Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use HTMLParser for all floki calls #11

Merged
merged 2 commits into from
Apr 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lib/premailex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ defmodule Premailex do
Documentation for Premailex.
"""

alias Premailex.HTMLParser

@doc """
Adds inline styles to an HTML string

Expand All @@ -29,7 +31,7 @@ defmodule Premailex do
@spec to_text(String.t()) :: String.t()
def to_text(html) do
html
|> Floki.find("body")
|> HTMLParser.all("body")
|> Premailex.HTMLToPlainText.process()
end
end
4 changes: 2 additions & 2 deletions lib/premailex/html_inline_styles.ex
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ defmodule Premailex.HTMLInlineStyles do

defp add_rule_set_to_html(%{selector: selector, rules: rules, specificity: specificity}, html) do
html
|> Floki.find(selector)
|> HTMLParser.all(selector)
|> Enum.reduce(html, &update_style_for_html(&2, &1, rules, specificity))
end

Expand Down Expand Up @@ -73,7 +73,7 @@ defmodule Premailex.HTMLInlineStyles do

defp normalize_style(html) do
html
|> Floki.find("[style]")
|> HTMLParser.all("[style]")
|> Enum.reduce(html, &merge_styles(&2, &1))
end

Expand Down
20 changes: 17 additions & 3 deletions lib/premailex/html_parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ defmodule Premailex.HTMLParser do
"""

@default_parser Premailex.HTMLParser.Floki
@type html_tree :: tuple | list

@doc """
Parses an HTML string into an HTML tree.
Expand All @@ -13,7 +14,7 @@ defmodule Premailex.HTMLParser do
iex> Premailex.HTMLParser.parse("<html><head></head><body><h1>Title</h1></body></html>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]}
"""
@spec parse(String.t()) :: tuple
@spec parse(String.t()) :: html_tree
def parse(html) do
  # Resolve the configured parser module, then delegate to its parse/1.
  parser_module = parser()
  parser_module.parse(html)
end
Expand All @@ -26,7 +27,7 @@ defmodule Premailex.HTMLParser do
iex> Premailex.HTMLParser.all({"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]}, "h1")
[{"h1", [], ["Title"]}]
"""
@spec all(tuple, String.t()) :: [tuple]
@spec all(html_tree, String.t()) :: [html_tree]
def all(tree, selector) do
  # Selector matching is delegated to the configured parser module's all/2.
  parser_module = parser()
  parser_module.all(tree, selector)
end
Expand All @@ -39,11 +40,24 @@ defmodule Premailex.HTMLParser do
iex> Premailex.HTMLParser.to_string({"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]})
"<html><head></head><body><h1>Title</h1></body></html>"
"""
@spec to_string(tuple) :: String.t()
@spec to_string(html_tree) :: String.t()
def to_string(tree) do
  # Serialisation is delegated to the configured parser module's to_string/1.
  parser_module = parser()
  parser_module.to_string(tree)
end

@doc """
Returns the text content of an HTML tree as a single string.

The work is delegated to the `text/1` function of the configured parser
module.

## Examples

    iex> Premailex.HTMLParser.text({"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]})
    "Title"
"""
@spec text(html_tree) :: String.t()
def text(tree) do
  parser_module = parser()
  parser_module.text(tree)
end

# Reads the parser module from the :premailex application's :html_parser
# setting at call time, falling back to @default_parser when it is unset.
defp parser, do: Application.get_env(:premailex, :html_parser, @default_parser)
Expand Down
12 changes: 9 additions & 3 deletions lib/premailex/html_parser/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,28 @@ defmodule Premailex.HTMLParser.Floki do
@moduledoc """
API connection with Floki
"""
alias Premailex.HTMLParser

@doc false
@spec parse(String.t()) :: tuple
@spec parse(String.t()) :: HTMLParser.html_tree()
# Parsing is handed straight to Floki.parse/1.
def parse(html), do: Floki.parse(html)

@doc false
@spec all(tuple, String.t()) :: [tuple]
@spec all(HTMLParser.html_tree(), String.t()) :: [HTMLParser.html_tree()]
# Selector lookup is handed straight to Floki.find/2.
def all(tree, selector), do: Floki.find(tree, selector)

@doc false
@spec to_string(tuple) :: String.t()
@spec to_string(HTMLParser.html_tree()) :: String.t()
# Serialisation is handed straight to Floki.raw_html/1.
def to_string(tree), do: Floki.raw_html(tree)

@doc false
@spec text(HTMLParser.html_tree()) :: String.t()
def text(tree) do
  # Text extraction is delegated to Floki.text/1.
  Floki.text(tree)
end
end
46 changes: 40 additions & 6 deletions lib/premailex/html_parser/meeseeks.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ defmodule Premailex.HTMLParser.Meeseeks do
API connection with Meeseeks
"""

require Logger
import Meeseeks.CSS
alias Premailex.HTMLParser
alias Meeseeks.{Selector.CSS.Parser.ParseError}

@doc false
@spec parse(String.t()) :: tuple
@spec parse(String.t()) :: HTMLParser.html_tree()
def parse(html) do
html
|> Meeseeks.parse()
Expand All @@ -15,21 +18,52 @@ defmodule Premailex.HTMLParser.Meeseeks do
[html] -> html
html -> html
end
|> sanitize()
end

@doc false
@spec all(tuple, String.t()) :: [tuple]
@spec all(HTMLParser.html_tree(), String.t()) :: [HTMLParser.html_tree()]
def all(tree, selector) do
tree
|> Meeseeks.all(css("#{selector}"))
|> Enum.map(&Meeseeks.tree/1)
try do
tree
|> Meeseeks.all(css("#{selector}"))
|> Enum.map(&Meeseeks.tree/1)
rescue
e in ParseError ->
Logger.warn("Meeseeks CSS ParseError: " <> e.message)
[]
end
end

@doc false
@spec to_string(tuple) :: String.t()
@spec to_string(HTMLParser.html_tree()) :: String.t()
def to_string(tree) do
  # Parse the tree with Meeseeks first, then render the parsed result as HTML.
  document = Meeseeks.parse(tree)
  Meeseeks.html(document)
end

@doc false
@spec text(HTMLParser.html_tree() | String.t()) :: String.t()
# Text nodes are returned unchanged.
def text(text) when is_binary(text), do: text
# Lists of nodes are concatenated in order with no separator.
def text(list) when is_list(list), do: Enum.map_join(list, "", &text/1)
# Three-element tuples contribute the text of their children.
def text({_element, _attrs, children}), do: text(children)
# Any other node shape (for example a `{:comment, _}` tuple) contributes no
# text instead of raising FunctionClauseError.
# NOTE(review): confirm which non-element node shapes the parser can emit.
def text(_other), do: ""

# Recursively drops whitespace-only text nodes from a parsed tree.
# Lists are sanitized element by element, then blank strings are removed.
defp sanitize(nodes) when is_list(nodes) do
  nodes
  |> Enum.map(&sanitize/1)
  |> Enum.reject(&blank_text?/1)
end

# Three-element tuples keep their first two elements; only children are sanitized.
defp sanitize({elem, attr, children}) do
  {elem, attr, sanitize(children)}
end

# Anything else (including a bare string that is not inside a list) is kept as is.
defp sanitize(any), do: any

# True only for binaries that are empty once trimmed; non-binaries are never blank.
defp blank_text?(text) when is_binary(text), do: String.trim(text) == ""
defp blank_text?(_any), do: false
end
42 changes: 28 additions & 14 deletions lib/premailex/html_to_plain_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Premailex.HTMLToPlainText do
@moduledoc """
Module that converts HTML emails to plain text.
"""
alias Premailex.Util
alias Premailex.{HTMLParser, Util}

@doc """
Processes HTML string into a plain text string.
Expand All @@ -14,7 +14,11 @@ defmodule Premailex.HTMLToPlainText do

"""
@spec process(String.t() | Util.html_tree()) :: String.t()
def process(html) when is_binary(html), do: html |> Floki.parse() |> process()
def process(html) when is_binary(html) do
html
|> HTMLParser.parse()
|> process()
end

def process(html) do
html
Expand All @@ -26,7 +30,7 @@ defmodule Premailex.HTMLToPlainText do
|> unordered_lists()
|> ordered_lists()
|> tables()
|> Floki.text()
|> HTMLParser.text()
|> wordwrap()
|> clear_linebreaks()
|> String.trim()
Expand All @@ -46,30 +50,30 @@ defmodule Premailex.HTMLToPlainText do
defp headings(html) do
  # The needles are the tag names "h1" through "h6".
  heading_tags = for level <- 1..6, do: "h#{level}"
  Util.traverse(html, heading_tags, &heading/1)
end

defp heading({type, _, content}) do
text = content |> Floki.text()
text = HTMLParser.text(content)

length =
text
|> String.split("\n")
|> Enum.map(&String.length(&1))
|> Enum.max()

"\n\n#{heading(type, text, length)}\n\n"
"\n\n" <> heading(type, text, length) <> "\n\n"
end

defp heading("h1", text, length) do
heading_line = String.duplicate("*", length)
"#{heading_line}\n#{text}\n#{heading_line}"
heading_line <> "\n" <> text <> "\n" <> heading_line
end

defp heading("h2", text, length) do
heading_line = String.duplicate("-", length)
"#{heading_line}\n#{text}\n#{heading_line}"
heading_line <> "\n" <> text <> "\n" <> heading_line
end

defp heading(_, text, length) do
heading_line = String.duplicate("-", length)
"#{text}\n#{heading_line}"
text <> "\n" <> heading_line
end

defp links(html) do
  # Traverse the tree with "a" as the needle and link/1 as the callback.
  Util.traverse(html, "a", &link/1)
end
Expand All @@ -81,7 +85,7 @@ defmodule Premailex.HTMLToPlainText do
|> elem(1)
|> String.replace("mailto:", "")

text = Floki.text(content)
text = HTMLParser.text(content)

link(String.trim(url), String.trim(text))
end
Expand All @@ -92,7 +96,7 @@ defmodule Premailex.HTMLToPlainText do
defp link(url, text, false), do: "#{text} (#{url})"

defp paragraphs(html) do
  # Traverse the tree with "p" as the needle and paragraph/1 as the callback.
  Util.traverse(html, "p", &paragraph/1)
end
defp paragraph({_, _, content}), do: "#{Floki.text(content)}\n\n"
defp paragraph({_, _, content}), do: HTMLParser.text(content) <> "\n\n"

defp unordered_lists(html) do
  # Traverse the tree with "ul" as the needle and unordered_list_items/1 as the callback.
  Util.traverse(html, "ul", &unordered_list_items/1)
end

Expand All @@ -102,7 +106,9 @@ defmodule Premailex.HTMLToPlainText do
|> Enum.join("")
end

defp unordered_list_item({_, _, content}), do: "* #{Floki.text(content)}\n"
defp unordered_list_item({_, _, content}) do
"* " <> HTMLParser.text(content) <> "\n"
end

defp ordered_lists(html) do
  # Traverse the tree with "ol" as the needle and ordered_list_items/1 as the callback.
  Util.traverse(html, "ol", &ordered_list_items/1)
end

Expand All @@ -113,24 +119,32 @@ defmodule Premailex.HTMLToPlainText do
|> Enum.join("")
end

defp ordered_list_item({_, _, content}, acc), do: "#{acc + 1}. #{Floki.text(content)}\n"
defp ordered_list_item({_, _, content}, acc) do
"#{acc + 1}. " <> HTMLParser.text(content) <> "\n"
end

defp tables(html) do
  # Traverse the tree with "table" as the needle and table/1 as the callback.
  Util.traverse(html, "table", &table/1)
end

defp table({_, _, table_rows}) do
# Callings tables/1 to make sure all nested tables have been processed
# Calling tables/1 to make sure all nested tables have been processed
table_rows
|> tables()
|> flatten_table_body()
|> Util.traverse("tr", &table_rows(&1))
|> Enum.join("\n")
end

defp table_rows({_, _, table_cells}) do
table_cells
|> Util.traverse("td", &Floki.text(&1))
|> Util.traverse("td", &HTMLParser.text(&1))
|> Enum.join(" ")
end

# Unwraps <tbody> wrappers, returning their children in place of the wrapper.
# A single-element list is unwrapped and processed as that element.
defp flatten_table_body([tree]), do: flatten_table_body(tree)
# Longer lists are processed element by element.
defp flatten_table_body(list) when is_list(list), do: Enum.map(list, &flatten_table_body/1)
# Attributes are ignored so that a <tbody> carrying attributes (class, style, ...)
# is unwrapped exactly like a bare one.
defp flatten_table_body({"tbody", _attrs, table_cells}), do: table_cells
# Everything else is returned untouched.
defp flatten_table_body(elem), do: elem

defp wordwrap(text) do
text
|> String.split("\n")
Expand Down
4 changes: 2 additions & 2 deletions lib/premailex/util.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ defmodule Premailex.Util do
Module that contains utility functions.
"""

@type html_tree :: tuple | list
@type needle :: binary | tuple | list
@type html_tree :: Premailex.HTMLParser.html_tree()
@type needle :: binary | html_tree

@doc """
Traverses tree searching for needle, and will call provided function on
Expand Down