Skip to content

Commit d4df251

Browse files
author
Edward Z. Yang ext:(%22)
committed
Release 0.1.0.
1 parent 45a220c commit d4df251

File tree

6 files changed

+88
-10
lines changed

6 files changed

+88
-10
lines changed

README

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,47 @@
1-
This implementation doesn't work yet. Bits and pieces are getting written
2-
at a time.
1+
html5lib - php flavour
32

4-
This is an implementation of the HTML5 specification for PHP. More friendly
5-
details forthcoming, but here are some notes:
3+
This is an implementation of the tokenization and tree-building parts
4+
of the HTML5 specification in PHP. Potential uses of this library
5+
can be found in web-scrapers and HTML filters.
6+
7+
Warning: This is a pre-alpha release, and as such, certain parts of
8+
this code are not up to snuff (e.g. error reporting and performance).
9+
However, the code is very close to spec and passes 100% of tests
10+
not related to parse errors. Nevertheless, expect to have to update
11+
your code on the next upgrade.
12+
13+
14+
Usage notes:
15+
16+
<?php
17+
require_once '/path/to/HTML5/Parser.php';
18+
$dom = HTML5_Parser::parse('<html><body>...');
19+
$nodelist = HTML5_Parser::parseFragment('<b>Boo</b><br>');
20+
$nodelist = HTML5_Parser::parseFragment('<td>Bar</td>', 'table');
21+
22+
23+
Documentation:
24+
25+
HTML5_Parser::parse($text)
26+
$text : HTML to parse
27+
return : DOMDocument of parsed document
28+
29+
HTML5_Parser::parseFragment($text, $context)
30+
$text : HTML to parse
31+
$context : String name of context element
32+
return : DOMNodeList of the parsed fragment's nodes
33+
34+
35+
Developer notes:
636

737
* To set up unit tests, you need to add a small stub file test-settings.php
838
that contains $simpletest_location = 'path/to/simpletest/'; This needs to
939
be version 1.1 (or, until that is released, SVN trunk) of SimpleTest.
1040

1141
* We don't want to ultimately use PHP's DOM because it is not tolerant
1242
of certain types of errors that HTML 5 allows (for example, an element
13-
"foo@bar"). But for now, we will, since it's much easier.
43+
"foo@bar"). But the current implementation uses it, since it's easy.
44+
Eventually, this html5lib implementation will get a version of SimpleTree
45+
and may start using that by default.
1446

1547
vim: et sw=4 sts=4

VERSION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0.1.0

library/HTML5/Parser.php

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
<?php
22

3+
require_once dirname(__FILE__) . '/Data.php';
4+
require_once dirname(__FILE__) . '/InputStream.php';
5+
require_once dirname(__FILE__) . '/TreeBuilder.php';
6+
require_once dirname(__FILE__) . '/Tokenizer.php';
7+
8+
/**
9+
* Outwards facing interface for HTML5.
10+
*/
311
class HTML5_Parser
412
{
5-
13+
/**
14+
* Parses a full HTML document.
15+
* @param $text HTML text to parse
16+
* @param $builder Custom builder implementation
17+
* @return Parsed HTML as DOMDocument
18+
*/
19+
static public function parse($text, $builder = null) {
20+
$tokenizer = new HTML5_Tokenizer($text, $builder);
21+
$tokenizer->parse();
22+
return $tokenizer->save();
23+
}
24+
/**
25+
* Parses an HTML fragment.
26+
* @param $text HTML text to parse
27+
* @param $context String name of context element to pretend parsing is in.
28+
* @param $builder Custom builder implementation
29+
* @return Parsed HTML fragment as DOMNodeList
30+
*/
31+
static public function parseFragment($text, $context = null, $builder = null) {
32+
$tokenizer = new HTML5_Tokenizer($text, $builder);
33+
$tokenizer->parseFragment($context);
34+
return $tokenizer->save();
35+
}
636
}

library/HTML5/Tokenizer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ class HTML5_Tokenizer {
8484
/**
8585
* @param $data Data to parse
8686
*/
87-
public function __construct($data) {
87+
public function __construct($data, $builder = null) {
8888
$this->stream = new HTML5_InputStream($data);
89-
$this->tree = new HTML5_TreeBuilder;
89+
if (!$builder) $this->tree = new HTML5_TreeBuilder;
9090
$this->content_model = self::PCDATA;
9191
}
9292

library/HTML5/TreeBuilder.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ private function strConst($number) {
121121
const MARKER = 300;
122122

123123
// Namespaces for foreign content
124-
const NS_HTML = 'http://www.w3.org/1999/xhtml';
124+
const NS_HTML = null; // to prevent DOM from requiring NS on everything
125125
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
126126
const NS_SVG = 'http://www.w3.org/2000/svg';
127127
const NS_XLINK = 'http://www.w3.org/1999/xlink';
@@ -3516,8 +3516,8 @@ public function currentTableIsTainted() {
35163516
*/
35173517
public function setupContext($context = null) {
35183518
$this->fragment = true;
3519-
$context = $this->dom->createElementNS(self::NS_HTML, $context);
35203519
if ($context) {
3520+
$context = $this->dom->createElementNS(self::NS_HTML, $context);
35213521
/* 4.1. Set the HTML parser's tokenization stage's content model
35223522
* flag according to the context element, as follows: */
35233523
switch ($context->tagName) {

tests/HTML5/ParserTest.php

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?php
2+
3+
require_once dirname(__FILE__) . '/../autorun.php';
4+
5+
class HTML5_ParserTest extends UnitTestCase
6+
{
7+
public function testParse() {
8+
$result = HTML5_Parser::parse('<html><body></body></html>');
9+
$this->assertIsA($result, 'DOMDocument');
10+
}
11+
public function testParseFragment() {
12+
$result = HTML5_Parser::parseFragment('<b>asdf</b> foo');
13+
$this->assertIsA($result, 'DOMNodeList');
14+
}
15+
}

0 commit comments

Comments
 (0)