Skip to content

Commit 0990158

Browse files
committed
split scraper as separate package.
0 parents  commit 0990158

11 files changed

+24323
-0
lines changed

.editorconfig

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
root = true
2+
3+
[*]
4+
charset = utf-8
5+
end_of_line = lf
6+
insert_final_newline = true
7+
trim_trailing_whitespace = true
8+
indent_style = tab
9+
indent_size = 2
10+
11+
[{.jshintrc,*.json,*.yml}]
12+
indent_style = space

.gitignore

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/*
2+
!Src
3+
!Tests
4+
!.editorconfig
5+
!composer.json
6+
!phpcs.xml.dist
7+
!phpunit.xml.dist
8+
!phpstan.dist.neon

Src/Helper/Marshaller.php

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
<?php
2+
declare( strict_types = 1 );
3+
4+
namespace TheWebSolver\Codegarage\Scraper\Helper;
5+
6+
use Closure;
7+
use DOMElement;
8+
9+
/**
 * Collects the (optionally transformed) content of a string or DOM element.
 *
 * An instance is bound to a tag name, may be given a callback that converts the
 * element to a string, and keeps a running list of every content string it has
 * collected until `reset()` is called.
 */
class Marshaller {
	/** @var Closure(string|DOMElement): string */
	private Closure $callback;
	/**
	 * Content strings gathered by `collect()` since construction or the last `reset()`.
	 *
	 * @var string[]
	 */
	private array $content = array();
	/** @var array{html:bool,node:bool} */
	private array $collect = array(
		'html' => false,
		'node' => false,
	);

	/** @param string $tagName Tag name this marshaller is registered for. */
	public function __construct( public readonly string $tagName ) {}

	/**
	 * Gets the flags that control which extra parts `collect()` returns.
	 *
	 * @return array{html:bool,node:bool}
	 */
	public function collectables(): array {
		return $this->collect;
	}

	/**
	 * Gets every content string collected so far; empty when nothing has been collected.
	 *
	 * @return string[]
	 */
	public function content(): array {
		return $this->content;
	}

	/** Makes `collect()` also return the element's outer HTML at index `1`. */
	public function collectHtml(): self {
		$this->collect['html'] = true;

		return $this;
	}

	/** Makes `collect()` also return the DOM element itself at index `2`. */
	public function collectElement(): self {
		// Must be the 'node' key: that is the flag `collect()` checks before adding the element.
		$this->collect['node'] = true;

		return $this;
	}

	/**
	 * Registers the callback used to turn the given string/element into its content string.
	 *
	 * @param callable(string|DOMElement): string $callback
	 */
	public function marshallWith( callable $callback ): self {
		$this->callback = $callback( ... );

		return $this;
	}

	/**
	 * Collects the trimmed content of the given string or element.
	 *
	 * Without a registered callback the content is the string itself or the element's
	 * text content. Non-breaking spaces are converted to plain spaces before trimming.
	 * The resulting content is also appended to the list returned by `content()`.
	 *
	 * @return array{0:string,1?:string,2?:DOMElement} Content at `0`; outer HTML at `1` and the
	 *                                                  element at `2` only when enabled and a
	 *                                                  DOMElement was given.
	 */
	public function collect( string|DOMElement $element ): array {
		$marshaller = $this->callback ?? null;
		$raw        = $marshaller
			? $marshaller( $element )
			: ( $element instanceof DOMElement ? $element->textContent : $element );
		$content    = trim( Normalize::nonBreakingSpaceToWhitespace( $raw ) );
		$collection = array( $content );

		if ( $this->isCollectable( $element, type: 'html' ) ) {
			// `saveHTML()` may yield false (and ownerDocument may be null); normalize both to ''.
			$collection[1] = $element->ownerDocument?->saveHTML( $element ) ?: '';
		}

		if ( $this->isCollectable( $element, type: 'node' ) ) {
			$collection[2] = $element;
		}

		$this->content[] = $content;

		return $collection;
	}

	/** Forgets every content string collected so far. */
	public function reset(): void {
		// Assign an empty list instead of unsetting so `content()` stays safe to call afterwards.
		$this->content = array();
	}

	/**
	 * Checks whether the given collectable type is enabled and the value is a DOM element.
	 *
	 * @phpstan-assert-if-true =DOMElement $element
	 */
	private function isCollectable( string|DOMElement $element, string $type ): bool {
		return $this->collect[ $type ] && $element instanceof DOMElement;
	}
}

Src/Helper/Normalize.php

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<?php
2+
declare( strict_types = 1 );
3+
4+
namespace TheWebSolver\Codegarage\Scraper\Helper;
5+
6+
/**
 * String normalization helpers.
 */
class Normalize {
	/**
	 * Swaps non-breaking spaces in the given value for plain spaces.
	 *
	 * The value is entity-encoded first so that a non-breaking space becomes `&nbsp;`,
	 * that entity is replaced with a space, and the result is decoded again.
	 */
	public static function nonBreakingSpaceToWhitespace( string $value ): string {
		$encoded    = htmlentities( $value );
		$normalized = str_replace( search: '&nbsp;', replace: ' ', subject: $encoded );

		return html_entity_decode( $normalized );
	}
}

Src/Traits/TableNodeAware.php

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
<?php
2+
declare( strict_types = 1 );
3+
4+
namespace TheWebSolver\Codegarage\Scraper\Traits;
5+
6+
use ArrayObject;
7+
use DOMNode;
8+
use DOMElement;
9+
use DOMNodeList;
10+
use SplFixedArray;
11+
use TheWebSolver\Codegarage\Scraper\Helper\Marshaller;
12+
13+
/**
 * Scans DOM nodes for `table` elements and collects the head (`th`) and data (`td`)
 * cells found in the rows of their `tbody` children, using registered Marshallers.
 *
 * The exhibiting class decides which tables are scanned via `isTargetedTable()`.
 */
trait TableNodeAware {
	private bool $scanAllTables = false;
	/** @var int[] */
	private array $tableIds = array();
	/** @var array<int,ArrayObject<int,string|array{0:string,1?:string,2?:DOMElement}>> */
	private array $tableHeads;
	/** @var array<ArrayObject<array-key,string|array{0:string,1?:string,2?:DOMElement}>> */
	private array $tableRows = array();
	/** @var SplFixedArray<string> */
	private SplFixedArray $tableHeadNames;
	/** @var array<string,Marshaller> */
	private array $marshallers;
	private bool $onlyContents = false;

	/** @return int[] List of `spl_object_id()` of each scanned table body (`tbody`) element. */
	public function getTableIds(): array {
		return $this->tableIds;
	}

	/**
	 * Gets the collected table heads, or only the head names of the last scanned head row.
	 *
	 * Returns an empty collection when no head row has been scanned yet.
	 *
	 * @return ($namesOnly is true ? SplFixedArray<string> : array<int,ArrayObject<int,string|array{0:string,1?:string,2?:DOMElement}>>)
	 */
	public function getTableHead( bool $namesOnly = false ): SplFixedArray|array {
		// Both properties stay uninitialized until a head row is scanned. Reading them directly
		// would throw an "accessed before initialization" Error, so fall back to empty values.
		return $namesOnly
			? ( $this->tableHeadNames ?? new SplFixedArray() )
			: ( $this->tableHeads ?? array() );
	}

	/** @return array<ArrayObject<array-key,string|array{0:string,1?:string,2?:DOMElement}>> */
	public function getTableData(): array {
		return $this->tableRows;
	}

	/** Registers the given marshallers, indexed by their tag name (later ones replace earlier ones). */
	public function useMarshaller( Marshaller ...$marshallers ): static {
		array_walk( $marshallers, $this->registerMarshaller( ... ) );

		return $this;
	}

	/** Sets whether scanning continues into further nodes after a table has already been scanned. */
	public function withAllTableNodes( bool $scan = true ): static {
		$this->scanAllTables = $scan;

		return $this;
	}

	/** Stores only the content strings of cells instead of the full collected arrays. */
	public function withOnlyContents(): static {
		$this->onlyContents = true;

		return $this;
	}

	/**
	 * Scans each of the given nodes for table contents. Does nothing without registered marshallers.
	 *
	 * @param DOMNodeList<DOMNode> $nodes
	 */
	public function scanTableBodyNodeIn( DOMNodeList $nodes ): void {
		if ( empty( $this->marshallers ) ) {
			return;
		}

		foreach ( $nodes as $node ) {
			$this->scanContentsOfTableBody( $node );
		}
	}

	// phpcs:ignore Generic.CodeAnalysis.UnusedFunctionParameter.Found -- To be used by exhibiting class.
	abstract protected function isTargetedTable( DOMElement $node ): bool;

	/**
	 * Scans the given node if it is a targeted table, or descends into its children otherwise.
	 *
	 * Only `tr` rows that are direct children of the table's direct `tbody` children are scanned.
	 */
	protected function scanContentsOfTableBody( DOMNode $node ): void {
		if ( ! $this->isDomElement( $node, tagName: 'table' ) ) {
			$this->scanForTableBodyNodeIn( $node->childNodes );

			return;
		}

		if ( ! $this->isTargetedTable( $node ) ) {
			return;
		}

		/** @var DOMElement[] */
		$tableBodies = array_filter( iterator_to_array( $node->childNodes ), $this->isTableBodyElement( ... ) );

		if ( empty( $tableBodies ) ) {
			return;
		}

		foreach ( $tableBodies as $tableBodyNode ) {
			$this->tableIds[] = $id = spl_object_id( $tableBodyNode );

			foreach ( $tableBodyNode->childNodes as $contentNode ) {
				if ( ! $this->isDomElement( $contentNode, tagName: 'tr' ) ) {
					continue;
				}

				// A row is tried as a head row first; it is scanned for data cells only if that fails.
				if ( ! $this->scanTableHead( $contentNode->childNodes, tableId: $id ) ) {
					$this->scanTableData( $contentNode->childNodes, tableId: $id );
				}
			}
		}
	}

	/**
	 * Collects the `th` cells among the given nodes as the head of the given table.
	 *
	 * @param DOMNodeList<DOMNode> $nodes
	 * @return bool False when no `th` marshaller is registered or no `th` cell is found.
	 */
	protected function scanTableHead( DOMNodeList $nodes, int $tableId ): bool {
		$marshaller = $this->marshallers['th'] ?? null;

		if ( ! $marshaller ) {
			return false;
		}

		$heads = array_filter( iterator_to_array( $nodes ), $this->isTableHeadElement( ... ) );

		if ( ! $heads ) {
			return false;
		}

		$heads                = array_map( $marshaller->collect( ... ), $heads );
		$names                = array_values( $marshaller->content() );
		$this->tableHeadNames = SplFixedArray::fromArray( $names );

		$this->tableHeads[ $tableId ] = new ArrayObject( $this->onlyContents ? $names : $heads );

		// The marshaller is shared between rows, so clear its collected content for the next one.
		$marshaller->reset();

		return true;
	}

	/**
	 * Collects the `td` cells among the given nodes as a data row of the given table.
	 *
	 * The row is keyed by the last scanned head names when their count matches the cell count.
	 *
	 * @param DOMNodeList<DOMNode> $nodes
	 * @return bool False when no `td` marshaller is registered or no `td` cell is found.
	 */
	protected function scanTableData( DOMNodeList $nodes, int $tableId ): bool {
		$marshaller = $this->marshallers['td'] ?? null;

		if ( ! $marshaller ) {
			return false;
		}

		$data = array_filter( iterator_to_array( $nodes ), $this->isTableDataElement( ... ) );

		if ( ! $data ) {
			return false;
		}

		/** @var string[] */
		$heads     = isset( $this->tableHeadNames ) ? $this->tableHeadNames->toArray() : array();
		$collected = array_map( $marshaller->collect( ... ), $data );
		$row       = count( $heads ) === count( $data ) ? array_combine( $heads, $collected ) : $collected;

		if ( $this->onlyContents ) {
			$contents = $marshaller->content();
			$row      = count( $heads ) === count( $contents ) ? array_combine( $heads, $contents ) : $contents;
		}

		// NOTE(review): every row of the same table body shares $tableId, so this assignment
		// replaces the row stored previously -- confirm that keeping only the last row is intended.
		$this->tableRows[ $tableId ] = new ArrayObject( $row );

		// The marshaller is shared between rows, so clear its collected content for the next one.
		$marshaller->reset();

		return true;
	}

	private function registerMarshaller( Marshaller $marshaller ): void {
		$this->marshallers[ $marshaller->tagName ] = $marshaller;
	}

	/**
	 * Scans the given nodes unless a table has already been scanned and scanning all tables is off.
	 *
	 * @param DOMNodeList<DOMNode> $nodes
	 */
	private function scanForTableBodyNodeIn( DOMNodeList $nodes ): void {
		if ( $this->tableIds && ! $this->scanAllTables ) {
			return;
		}

		if ( $nodes->count() ) {
			$this->scanTableBodyNodeIn( $nodes );
		}
	}

	/** @phpstan-assert-if-true =DOMElement $node */
	private function isDomElement( mixed $node, string $tagName ): bool {
		return $node instanceof DOMElement && $tagName === $node->tagName;
	}

	/** @phpstan-assert-if-true =DOMElement $node */
	private function isTableBodyElement( mixed $node ): bool {
		return $this->isDomElement( $node, tagName: 'tbody' );
	}

	/** @phpstan-assert-if-true =DOMElement $node */
	private function isTableHeadElement( mixed $node ): bool {
		return $this->isDomElement( $node, tagName: 'th' );
	}

	/** @phpstan-assert-if-true =DOMElement $node */
	private function isTableDataElement( mixed $node ): bool {
		return $this->isDomElement( $node, tagName: 'td' );
	}
}

0 commit comments

Comments
 (0)