Skip to content

Commit aaa4b55

Browse files
committed
update: TableTracer::inferTableFrom() parameter #1 ($source) now also accepts a DOMElement.
refactor: other code and comments.
1 parent a919e54 commit aaa4b55

File tree

4 files changed

+153
-60
lines changed

4 files changed

+153
-60
lines changed

Src/Interfaces/TableTracer.php

+25-22
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
use Iterator;
77
use ArrayObject;
8+
use DOMElement;
89
use SplFixedArray;
910
use TheWebSolver\Codegarage\Scraper\Enums\Table;
1011
use TheWebSolver\Codegarage\Scraper\Enums\EventAt;
@@ -14,27 +15,27 @@
1415
/** @template TColumnReturn */
1516
interface TableTracer extends Indexable {
1617
/**
17-
* Sets whether all traced tables should be scanned or not.
18+
* Registers whether all tables present in the given source should be traced or not.
1819
*/
1920
public function withAllTables( bool $trace = true ): static;
2021

2122
/**
22-
* Registers target table structure(s) to be omitted from being traced and inferred.
23+
* Registers targeted table structure(s) to be omitted from being traced.
2324
*
2425
* @no-named-arguments
2526
*/
2627
public function traceWithout( Table ...$targets ): static;
2728

2829
/**
29-
* Registers transformer for targeted table structure.
30+
* Registers transformer for the targeted table structure.
3031
*
3132
* @param Transformer<contravariant static,TReturn> $transformer
3233
* @template TReturn
3334
*/
3435
public function addTransformer( Table $for, Transformer $transformer ): static;
3536

3637
/**
37-
* Registers event listener for targeted table structure.
38+
* Registers event listener for the targeted table structure and at the given event time.
3839
*
3940
* @param callable(TableTraced): void $callback
4041
*/
@@ -43,13 +44,15 @@ public function addEventListener( Table $for, callable $callback, EventAt $event
4344
/**
4445
* Infers table(s) from given HTML content source.
4546
*
46-
* @param bool $normalize When set to true, whitespaces/tabs/newlines and other
47-
* similar characters and controls must get cleaned.
47+
* @param string|DOMElement $source Either a HTML source or a table DOMElement.
48+
* @param bool $normalize When set to true, whitespaces/tabs/newlines and other
49+
* similar characters and controls must be cleaned.
50+
* @throws InvalidSource When unsupported $source given, or no "table" in $source.
4851
*/
49-
public function inferTableFrom( string $source, bool $normalize ): void;
52+
public function inferTableFrom( string|DOMElement $source, bool $normalize ): void;
5053

5154
/**
52-
* Infers table head content from given element list.
55+
* Infers table head content from the given element list.
5356
*
5457
* @param iterable<array-key,TElement> $elementList
5558
* @throws InvalidSource When TElement is not a valid type.
@@ -58,12 +61,11 @@ public function inferTableFrom( string $source, bool $normalize ): void;
5861
public function inferTableHeadFrom( iterable $elementList ): void;
5962

6063
/**
61-
* Infers table column data from given element list.
64+
* Infers table columns' content as a dataset from the given element list.
6265
*
6366
* @param iterable<int,TElement> $elementList
6467
* @return array<TColumnReturn>
6568
* @throws InvalidSource When TElement is not a valid type.
66-
*
6769
* @template TElement
6870
*/
6971
public function inferTableDataFrom( iterable $elementList ): array;
@@ -76,37 +78,38 @@ public function inferTableDataFrom( iterable $elementList ): array;
7678
public function getTableId( bool $current = false ): int|string|array;
7779

7880
/**
79-
* Gets collection of traced table columns's data indexed by respective table ID.
80-
*
81-
* @return array<Iterator<int,ArrayObject<array-key,TColumnReturn>>>
82-
*/
83-
public function getTableData(): array;
84-
85-
/**
86-
* Gets traced table caption data indexed by respective table ID.
81+
* Gets traced table caption content indexed by respective table ID, if any.
8782
*
8883
* @return array<string|null>
8984
*/
9085
public function getTableCaption(): array;
9186

9287
/**
93-
* Gets traced table head data indexed by respective table ID.
88+
* Gets traced table head content indexed by respective table ID.
9489
*
9590
* @return array<SplFixedArray<string>>
9691
*/
9792
public function getTableHead(): array;
9893

9994
/**
100-
* Resets traced structures' details.
95+
* Gets traced table columns' content Iterator indexed by respective table ID.
96+
*
97+
* @return array<Iterator<int,ArrayObject<array-key,TColumnReturn>>>
98+
*/
99+
public function getTableData(): array;
100+
101+
/**
102+
* Resets traced table structures' details.
101103
*
102-
* This may only be invoked after retrieving a table Iterator and no further table tracing is required.
104+
* This may only be invoked after retrieving table columns' content Iterator
105+
* and no further tracing is required of any remaining table structures.
103106
*/
104107
public function resetTableTraced(): void;
105108

106109
/**
107110
* Resets registered hooks such as event listeners and transformers.
108111
*
109-
* This may only be invoked after any iteration is complete to prevent side-effects
112+
* This may only be invoked after an iteration is complete to prevent side-effects
110113
* of hooks not being applied to remaining items of an Iterator being iterated.
111114
*/
112115
public function resetTableHooks(): void;

Src/Traits/Table/HtmlTableFromNode.php

+65-37
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,17 @@ trait HtmlTableFromNode {
2222
/** @use TableExtractor<TColumnReturn> */
2323
use TableExtractor;
2424

25-
public function inferTableFrom( string $source, bool $normalize = true ): void {
26-
$this->inferTableFromDOMNodeList(
27-
DOMDocumentFactory::bodyFromHtml( $source, normalize: $normalize )->childNodes
28-
);
25+
/** @throws InvalidSource When a DOMElement $source is not a "table" element (string sources are not validated). */
26+
public function inferTableFrom( string|DOMElement $source, bool $normalize = true ): void {
27+
$source = $this->getValidatedTableSource( $source, $normalize );
28+
29+
if ( $source instanceof DOMNodeList ) {
30+
$this->inferTableFromDOMNodeList( $source );
31+
32+
return;
33+
}
34+
35+
$this->inferChildNodesFromTable( $source );
2936
}
3037

3138
/** @param DOMNodeList<DOMNode> $elementList */
@@ -87,37 +94,43 @@ public function inferTableDataFrom( iterable $elementList ): array {
8794
return $data;
8895
}
8996

90-
/** @param DOMNodeList<DOMNode> $elementList */
91-
protected function inferTableFromDOMNodeList( DOMNodeList $elementList ): void {
92-
foreach ( $elementList as $node ) {
93-
if ( ! $tableStructure = $this->traceStructureFrom( $node ) ) {
94-
continue;
95-
}
97+
protected function inferChildNodesFromTable( DOMElement $element ): bool {
98+
$iterator = $this->childNodesIteratorOfTable( $element );
9699

97-
assert( $node instanceof DOMElement );
100+
if ( ! $iterator || ! $tableStructure = $this->traceTableStructureIn( $iterator ) ) {
101+
return false;
102+
}
98103

99-
[$bodyNode, $captionNode, $headNode] = $tableStructure;
104+
[$bodyNode, $captionNode, $headNode] = $tableStructure;
100105

101-
$splId = spl_object_id( $node );
102-
$id = $splId * spl_object_id( $bodyNode );
106+
$splId = spl_object_id( $element );
107+
$id = $splId * spl_object_id( $bodyNode );
103108

104-
$this->dispatchEventForTable( $id, $bodyNode );
109+
$this->dispatchEventForTable( $id, $bodyNode );
105110

106-
$captionNode && $this->captionStructureContentFrom( $captionNode );
107-
$headNode && $this->headStructureContentFrom( $headNode );
111+
$captionNode && $this->captionStructureContentFrom( $captionNode );
112+
$headNode && $this->headStructureContentFrom( $headNode );
108113

109-
$iterator = $this->bodyStructureIteratorFrom( $bodyNode );
114+
$iterator = $this->bodyStructureIteratorFrom( $bodyNode );
110115

111-
$iterator->valid() && ( $this->discoveredTable__rows[ $id ] = $iterator );
116+
$iterator->valid() && ( $this->discoveredTable__rows[ $id ] = $iterator );
112117

113-
if ( $this->discoveredTargetedTable( $node ) ) {
114-
$this->dispatchEvent( new TableTraced( Table::TBody, EventAt::End, $node, $this ) );
118+
$this->dispatchEvent( new TableTraced( Table::TBody, EventAt::End, $element, $this ) );
115119

116-
break;
120+
return true;
121+
}
122+
123+
/** @param DOMNodeList<DOMNode> $elementList */
124+
protected function inferTableFromDOMNodeList( DOMNodeList $elementList ): void {
125+
foreach ( $elementList as $node ) {
126+
if ( ! AssertDOMElement::isValid( $node ) || ! $this->inferChildNodesFromTable( $node ) ) {
127+
continue;
117128
}
118129

119-
$this->dispatchEvent( new TableTraced( Table::TBody, EventAt::End, $node, $this ) );
120-
}//end foreach
130+
if ( $this->discoveredTargetedTable( $node ) ) {
131+
break;
132+
}
133+
}
121134
}
122135

123136
final protected function findTableStructureIn( DOMNode $node, int $minChildNodesCount = 0 ): void {
@@ -132,8 +145,8 @@ final protected function isTableRowStructure( DOMNode $node ): bool {
132145
}
133146

134147
/** @return Iterator<int,DOMNode> */
135-
private function getChildNodesIteratorFrom( DOMNode $node ): Iterator {
136-
return $node->childNodes->getIterator();
148+
private function getChildNodesIteratorFrom( DOMElement $element ): Iterator {
149+
return $element->childNodes->getIterator();
137150
}
138151

139152
/**
@@ -151,15 +164,15 @@ private function assertCurrentColumnIsDOMElement( mixed $node ): void {
151164
}
152165

153166
/** @return ?Iterator<int,DOMNode> */
154-
private function fromCurrentStructure( DOMNode $node ): ?Iterator {
155-
if ( ! AssertDOMElement::isValid( $node, 'table' ) ) {
156-
$this->findTableStructureIn( $node );
167+
private function childNodesIteratorOfTable( DOMElement $element ): ?Iterator {
168+
if ( 'table' !== $element->tagName ) {
169+
$this->findTableStructureIn( $element );
157170

158171
return null;
159172
}
160173

161-
return $this->isTargetedTable( $node ) && $node->childNodes->length
162-
? $this->getChildNodesIteratorFrom( $node )
174+
return $this->isTargetedTable( $element ) && $element->childNodes->length
175+
? $this->getChildNodesIteratorFrom( $element )
163176
: null;
164177
}
165178

@@ -209,12 +222,11 @@ private function headStructureContentFrom( DOMElement $node ): void {
209222
$this->dispatchEvent( new TableTraced( Table::THead, EventAt::End, $node, $this ) );
210223
}
211224

212-
/** @return ?array{0:DOMElement,1:?DOMElement,2:?DOMElement} */
213-
private function traceStructureFrom( DOMNode $node ): ?array {
214-
if ( ! $tableIterator = $this->fromCurrentStructure( $node ) ) {
215-
return null;
216-
}
217-
225+
/**
226+
* @param Iterator<int,DOMNode> $tableIterator
227+
* @return ?array{0:DOMElement,1:?DOMElement,2:?DOMElement}
228+
*/
229+
private function traceTableStructureIn( Iterator $tableIterator ): ?array {
218230
$bodyNode = $captionNode = $headNode = null;
219231

220232
while ( ! $bodyNode && $tableIterator->valid() ) {
@@ -303,4 +315,20 @@ private function discoveredTargetedTable( mixed $node ): bool {
303315
&& AssertDOMElement::isValid( $node )
304316
&& $this->isTargetedTable( $node );
305317
}
318+
319+
/**
320+
* @return DOMElement|DOMNodeList<DOMNode>
321+
* @throws InvalidSource When $source is a DOMElement whose tag is not "table".
322+
*/
323+
private function getValidatedTableSource( string|DOMElement $source, bool $normalize ): DOMElement|DOMNodeList {
324+
if ( ! $source instanceof DOMElement ) {
325+
return DOMDocumentFactory::bodyFromHtml( $source, normalize: $normalize )->childNodes;
326+
}
327+
328+
'table' !== $source->tagName && throw new InvalidSource(
329+
sprintf( '%s trait only supports table "DOMElement"', HtmlTableFromNode::class )
330+
);
331+
332+
return $source;
333+
}
306334
}

Src/Traits/Table/HtmlTableFromString.php

+23-1
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55

66
use DOMNode;
77
use Iterator;
8+
use DOMElement;
89
use ArrayObject;
910
use TheWebSolver\Codegarage\Scraper\Enums\Table;
1011
use TheWebSolver\Codegarage\Scraper\Enums\EventAt;
1112
use TheWebSolver\Codegarage\Scraper\Helper\Normalize;
1213
use TheWebSolver\Codegarage\Scraper\Event\TableTraced;
1314
use TheWebSolver\Codegarage\Scraper\Data\CollectionSet;
1415
use TheWebSolver\Codegarage\Scraper\Error\ScraperError;
16+
use TheWebSolver\Codegarage\Scraper\Error\InvalidSource;
1517
use TheWebSolver\Codegarage\Scraper\Traits\Table\TableExtractor;
1618

1719
/** @template TColumnReturn */
@@ -31,7 +33,10 @@ final public function withAllTables( bool $trace = false ): static {
3133
return $this;
3234
}
3335

34-
public function inferTableFrom( string $source, bool $normalize = true ): void {
36+
/** @throws InvalidSource When given $source is not a string, or lacks a "<table" opening or "</table>" closing tag. */
37+
public function inferTableFrom( string|DOMElement $source, bool $normalize = true ): void {
38+
$this->validateSourceHasTableStructure( $source );
39+
3540
$node = $normalize ? Normalize::controlsAndWhitespacesIn( $source ) : $source;
3641

3742
if ( ! $tableStructure = $this->traceStructureFrom( $node ) ) {
@@ -284,4 +289,21 @@ private function inspectFirstRowForHeadStructure( array $row ): bool {
284289

285290
return $this->currentIteration__allTableHeads;
286291
}
292+
293+
/**
294+
* @throws InvalidSource When $source is a DOMElement, or a string without both "<table" and "</table>".
295+
* @phpstan-assert string $source
296+
*/
297+
private function validateSourceHasTableStructure( string|DOMElement $source ): void {
298+
$source instanceof DOMElement && throw new InvalidSource(
299+
sprintf( '%s trait only supports "string" source to infer table.', HtmlTableFromString::class )
300+
);
301+
302+
( str_contains( $source, '<table' ) && str_contains( $source, '</table>' ) ) || throw new InvalidSource(
303+
sprintf(
304+
'%s trait cannot trace table structure from string that does not have table.',
305+
HtmlTableFromString::class
306+
)
307+
);
308+
}
287309
}

Tests/HtmlTableExtractorTest.php

+40
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use DOMNode;
88
use DOMElement;
99
use ArrayObject;
10+
use DOMDocument;
1011
use LogicException;
1112
use PHPUnit\Framework\TestCase;
1213
use PHPUnit\Framework\Attributes\Test;
@@ -18,6 +19,7 @@
1819
use TheWebSolver\Codegarage\Scraper\Event\TableTraced;
1920
use TheWebSolver\Codegarage\Scraper\DOMDocumentFactory;
2021
use TheWebSolver\Codegarage\Scraper\Error\ScraperError;
22+
use TheWebSolver\Codegarage\Scraper\Error\InvalidSource;
2123
use TheWebSolver\Codegarage\Scraper\Interfaces\TableTracer;
2224
use TheWebSolver\Codegarage\Scraper\Interfaces\Transformer;
2325
use TheWebSolver\Codegarage\Scraper\Marshaller\MarshallTableRow;
@@ -31,6 +33,44 @@ private function getTableFromSource(): string {
3133
return file_get_contents( self::TABLE_SOURCE ) ?: '';
3234
}
3335

36+
/** @param class-string<TableTracer<mixed>> $classname */
37+
#[Test]
38+
#[DataProvider( 'provideInvalidSource' )]
39+
public function itThrowsExceptionWhenInvalidSourceGiven(
40+
string $classname,
41+
string|DOMElement $element,
42+
?int $count = null
43+
): void {
44+
if ( null === $count ) {
45+
$this->expectException( InvalidSource::class );
46+
}
47+
48+
$scanner = new $classname();
49+
$scanner->inferTableFrom( $element, false );
50+
51+
// @phpstan-ignore-next-line -- "null" always throws exception.
52+
$this->assertCount( $count, $scanner->getTableId() );
53+
}
54+
55+
/** @return mixed[] */
56+
public static function provideInvalidSource(): array {
57+
$divElement = new DOMElement( 'div' );
58+
$string = '<div></div>';
59+
$tableString = '<table><tbody><tr><td>content</td></tr></tbody></table>';
60+
61+
$dom = new DOMDocument();
62+
$dom->loadHTML( $tableString, LIBXML_NOERROR | LIBXML_NOBLANKS );
63+
64+
return [
65+
[ DOMNodeScanner::class, $divElement ],
66+
[ DOMNodeScanner::class, $string, 0 ], // Does not verify string value.
67+
[ DOMNodeScanner::class, $dom->getElementsByTagName( 'body' )->item( 0 )?->firstChild, 1 ],
68+
[ DOMStringScanner::class, $divElement ],
69+
[ DOMStringScanner::class, $string ],
70+
[ DOMStringScanner::class, $tableString, 1 ],
71+
];
72+
}
73+
3474
#[Test]
3575
public function itOnlyScansTargetedTable(): void {
3676
$nodeScanner = new DOMNodeScanner( fn( $node ) => AssertDOMElement::hasId( $node, 'inner-content-table' ) );

0 commit comments

Comments
 (0)