-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfencingtimeParser.php
387 lines (324 loc) · 12.9 KB
/
fencingtimeParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
<?php
/**
* Fencingtime Parser
*
* Extracts the results data from every conceivable version of Fencingtime and outputs it in a
* variety of formats.
*
* @author Dan Kew <[email protected]>
* @license http://opensource.org/licenses/gpl-3.0 GNU General Public License, version 3 (GPLv3)
* @version v1.0.0
*/
namespace fencingtime;
class FencingtimeParser
{
    // The URL of the fencingtime results page
    private $_urlInFull;
    // Set to 1 for fencingtimelive URLs, which are fetched and parsed differently
    private $_isFencingtimeLive = 0;
    // The raw HTML of the fencingtime results page
    private $_dataBody;
    // The header cells (column labels) of the results table
    private $_resultsHeader;
    // The raw HTML of just the results table
    private $_resultsBody;
    // Column indexes into the results table (-1 when the column is absent)
    private $_rankingIndex;
    private $_fullnameIndex;
    private $_clubIndex;
    private $_countryIndex;
    // All the values used to identify the text in the results table header
    private $_rankArray = array("Rank", "Ranking", "Rnk", "Rg", "Cl.", "Place");
    private $_fullnameArray = array("Name", "Surname", "Nom", "Apellido-nom");
    private $_clubArray = array("Club", "Egyesület", "Club(s)");
    private $_countryArray = array("Country");
    // Not read anywhere in this class
    private $_olderVersion = 0;

    /**
     * Initialize the class and set its properties.
     *
     * Fetches the page straight away, so constructing an instance performs
     * network requests and terminates the script if the page cannot be read.
     *
     * @param string $inUrl The URL of the fencingtime results
     *
     * @since 1.0.0
     */
    public function __construct($inUrl)
    {
        // So there is no need to pass the URL around, we fix it here
        $this->_urlInFull = $inUrl;
        // Is it the new fencingtimelive result set?
        $this->_isLive();
        // Fetch the page data ready for manipulation
        $this->getFencingtimePage();
    }

    /**
     * Checks for a URL that contains fencingtimelive - this is the
     * new system that uses dynamic page generation and needs to be
     * handled differently to the old single page of css formatted
     * results
     *
     * @since 1.0.0
     *
     * @return none / updates the $_isFencingtimeLive variable accordingly
     */
    private function _isLive()
    {
        if (strpos($this->_urlInFull, 'fencingtimelive') !== false) {
            $this->_isFencingtimeLive = 1;
        }
    }

    /**
     * Checks that the supplied URL exists and is accessible
     *
     * The URL is treated as missing when no headers come back or when the
     * final response is a 404, whatever HTTP version the server reports.
     *
     * @since 1.0.0
     *
     * @return boolean $exists
     */
    private function _URLexists()
    {
        $urlHeaders = get_headers($this->_urlInFull);
        if (!is_array($urlHeaders) || count($urlHeaders) === 0) {
            return false;
        }
        // When redirects are followed the list holds one status line per
        // response; the last one describes the page actually reached.
        $statusLine = $urlHeaders[0];
        foreach ($urlHeaders as $headerLine) {
            if (is_string($headerLine) && stripos($headerLine, 'HTTP/') === 0) {
                $statusLine = $headerLine;
            }
        }
        return preg_match('~^HTTP/\S+\s+404\b~i', (string) $statusLine) !== 1;
    }

    /**
     * Ready the HTML table of results for parsing and load into _resultsBody
     *
     * _resultsBody becomes an empty string when no known results table (or
     * its closing tag) can be found, so later parsing yields no rows instead
     * of working on an undefined marker.
     *
     * @since 1.0.0
     *
     * @return none
     */
    private function _prepareResultsData()
    {
        $this->_resultsBody = '';
        $html = (string) $this->_dataBody;
        // To accommodate historic data, we need to take into account
        // data being extracted from waybackmachine
        $waybackMarker = '<!-- END WAYBACK TOOLBAR INSERT -->';
        if (strpos($html, $waybackMarker) !== false) {
            $html = stristr($html, $waybackMarker);
        }
        $tableMarker = $this->_findTableMarker($html);
        if ($tableMarker === null) {
            return;
        }
        $fromTable = stristr($html, $tableMarker);
        if ($fromTable === false) {
            return;
        }
        // Keep everything from the marker up to (not including) the first </table>
        $upToClose = stristr($fromTable, '</table>', true);
        if ($upToClose !== false) {
            $this->_resultsBody = $upToClose;
        }
    }

    /**
     * Picks the text that marks the start of the results table, which varies
     * depending on the fencingtime version that produced the page
     *
     * @param string $html page HTML to inspect
     *
     * @return string|null the marker, or null when no known marker is present
     */
    private function _findTableMarker($html)
    {
        if ($this->_isFencingtimeLive === 1) {
            return '<table id="resultList"';
        }
        $knownMarkers = array(
            // Tabbed CSS version
            'id="finalResults">',
            // Single page coloured version with no tabs
            '<table class="dataTable"',
            // White background w/ green header version
            '<table class="reporttable"',
        );
        foreach ($knownMarkers as $marker) {
            if (strpos($html, $marker) !== false) {
                return $marker;
            }
        }
        return null;
    }

    /**
     * Splits the results table HTML at every opening TR tag
     *
     * Chunk 0 is whatever precedes the first row, chunk 1 is the first
     * (header) row and the remaining chunks are the data rows.
     *
     * @return array list of HTML chunks
     */
    private function _splitRows()
    {
        if (!(isset($this->_resultsBody))) {
            $this->_prepareResultsData();
        }
        // Use regex as explode does not handle case sensitivity; \b stops
        // other tags that merely start with "tr" from being treated as rows
        $rows = preg_split('/<tr\b/i', (string) $this->_resultsBody);
        return $rows === false ? array() : $rows;
    }

    /**
     * Extract the header information for the results table. These can
     * vary depending on what the competition organiser has chosen to include.
     * Ordinarily it'll be something like;
     *
     * Rank | Name | Club | Country
     *
     * Column headings should be in the first row, but if some spurious results
     * start to appear then this will need to be looked at.
     *
     * @since 1.0.0
     *
     * @return none
     */
    private function _prepareResultsTableHeader()
    {
        $tableRows = $this->_splitRows();
        $this->_resultsHeader = isset($tableRows[1])
            ? $this->_myArrayFilter($tableRows[1])
            : array();
    }

    /**
     * Scans the returned HTML results table header and returns the correct
     * position of the passed column label
     *
     * @param array $colArray predefined array of known column text labels
     *
     * @return int index into the table header for the desired column, or -1
     *             when none of the labels is present
     */
    private function _setIndexColumn($colArray)
    {
        if (!(isset($this->_resultsHeader))) {
            $this->_prepareResultsTableHeader();
        }
        foreach ($this->_resultsHeader as $position => $label) {
            if (in_array($label, $colArray, true)) {
                return $position;
            }
        }
        return -1;
    }

    /**
     * Loads all the column indexes from the HTML results table header so
     * we can be sure we are grabbing the right data from the right columns
     *
     * @return none
     */
    private function _prepareResultsTableHeaderIndexes()
    {
        $this->_rankingIndex = $this->_setIndexColumn($this->_rankArray);
        $this->_fullnameIndex = $this->_setIndexColumn($this->_fullnameArray);
        $this->_clubIndex = $this->_setIndexColumn($this->_clubArray);
        $this->_countryIndex = $this->_setIndexColumn($this->_countryArray);
    }

    /**
     * Extracts all the results from the HTML results table, cleans and loads
     * them into an array ready for manipulation
     *
     * @return array list of rows, each being
     *               array(rank, surname, forename, club, country); a value is
     *               an empty string when its column or cell is missing
     */
    public function getAllResults()
    {
        $this->_prepareResultsTableHeaderIndexes();
        $allResults = array();
        // Skip the markup before the first row and the header row itself
        foreach (array_slice($this->_splitRows(), 2) as $rowHtml) {
            $cells = $this->_myArrayFilter($rowHtml);
            $names = $this->_normalizeName($this->_cellAt($cells, $this->_fullnameIndex));
            $allResults[] = array(
                $this->_cellAt($cells, $this->_rankingIndex),
                $names[0],
                $names[1],
                $this->_cellAt($cells, $this->_clubIndex),
                $this->_cellAt($cells, $this->_countryIndex),
            );
        }
        return $allResults;
    }

    /**
     * Reads one cell from a parsed row without tripping over absent columns
     * or rows that have fewer cells than the header
     *
     * @param array $cells parsed cell texts of one row
     * @param int   $index column index, -1 when the column is absent
     *
     * @return string the cell text, or an empty string when unavailable
     */
    private function _cellAt($cells, $index)
    {
        return ($index !== -1 && isset($cells[$index])) ? $cells[$index] : '';
    }

    /**
     * Name format is commonly (UPPERCASESURNAME Forename) or
     * (UPPERCASESURNAME, Forename) other random words can also appear
     * in the text when competition organisers over-ride the standard
     * name protocol. However, older versions of fencingtime do not
     * follow this approach.
     *
     * @param string $inName raw fencer name
     *
     * @return array parsed names as array(surname, forename)
     */
    private function _normalizeName($inName)
    {
        $forename = $surname = '';
        $removeThisText = array("(None)", "(V)", "(C)", "(J)");
        // Remove anything that's not needed
        $stripped = str_ireplace($removeThisText, '', (string) $inName);
        $cleaned = preg_replace('/[\s]+/mu', ' ', $stripped);
        if ($cleaned === null) {
            // preg_replace() gives null on failure (e.g. text that is not
            // valid UTF-8 under the /u modifier); retry without /u
            $cleaned = (string) preg_replace('/\s+/', ' ', $stripped);
        }
        if (strpos($cleaned, ',') !== false) {
            // "SURNAME, Forename": whatever precedes the first comma is the surname
            $parts = explode(",", $cleaned);
            $surname = $parts[0];
            $forename = $parts[1];
        } else {
            // Words that equal their own upper-case form count as surname
            foreach (explode(" ", $cleaned) as $word) {
                if (mb_strtoupper($word, 'utf-8') === $word) {
                    $surname .= " " . $word;
                } else {
                    $forename .= " " . $word;
                }
            }
        }
        return array(trim($surname), trim($forename));
    }

    /**
     * Gets the HTML from the source URL and loads it in its raw form
     * into _dataBody
     *
     * Can be called directly. Anything parsed from an earlier fetch is
     * discarded so later calls work on the fresh page. Terminates the script
     * via _handleError() when the URL cannot be located or read.
     *
     * @return string $this->_dataBody
     */
    public function getFencingtimePage()
    {
        if ($this->_URLexists()) {
            // Forget the table and header cut from a previously fetched page
            $this->_resultsBody = null;
            $this->_resultsHeader = null;
            if ($this->_isFencingtimeLive == 1) {
                // This portion of code came from
                // https://guymclean.co.uk/web-scraping-after-javascript-finished/
                // and is worth its weight in gold!
                // Scrape data from fencingtime after Javascript/AJAX requests have run.
                $requestContent = json_encode(array("url" => $this->_urlInFull, "renderType" => "html"));
                // Can only make 100 requests a day with the demo key
                $url = 'http://PhantomJScloud.com/api/browser/v2/a-demo-key-with-low-quota-per-ip-address/';
                $options = array(
                    'http' => array(
                        'header' => "Content-type: application/json\r\n",
                        'method' => 'POST',
                        'content' => $requestContent
                    )
                );
                $context = stream_context_create($options);
                $this->_dataBody = file_get_contents($url, false, $context);
                // here ends the great portion of code
            } else {
                $this->_dataBody = file_get_contents($this->_urlInFull);
            }
            if ($this->_dataBody === false) {
                $this->_handleError('Unable to read data from supplied URL.');
            }
        } else {
            $this->_handleError('Unable to locate supplied URL');
        }
        return $this->_dataBody;
    }

    /**
     * Simple error handler to display message when something doesn't
     * go to plan and stop executing the script!
     *
     * @param string $inError message indicating the error
     *
     * @return none
     */
    private function _handleError($inError)
    {
        die($inError);
    }

    /**
     * Handles the parsing of the results table HTML
     *
     * Turns one row chunk (the text following "<tr") into the list of its
     * cell texts. Cell texts are trimmed, and a cell is kept as long as it is
     * not empty, so a cell containing just "0" is kept.
     *
     * @param string $inString chunk of HTML to parse containing the result
     *
     * @return array list of cell texts
     */
    private function _myArrayFilter($inString)
    {
        // remove any hard coded NBSP entities and the raw UTF-8 hardspaces
        // that ft includes, so a cell holding only those counts as empty below
        $inString = str_replace(array('&nbsp;', chr(194).chr(160)), '', (string) $inString);
        // and prevent empty TD cells from affecting the array index
        $inString = preg_replace('~\></td>~', '>-</td>', $inString);
        // Mark the end of every tag, strip the tags, then split on the marks
        $pieces = explode("|", strip_tags(str_replace(">", ">|", (string) $inString)));
        $cells = array();
        foreach ($pieces as $piece) {
            $piece = trim($piece);
            if ($piece !== '') {
                $cells[] = $piece;
            }
        }
        // The first piece is the remainder of the opening TR tag, not a cell
        return array_slice($cells, 1);
    }
}
class FencingtimeFormatter extends FencingtimeParser
{
    /**
     * Sends all the results to the output as a CSV download named
     * fencingtime.csv, with a heading row followed by one line per result
     *
     * @return none
     */
    public function getAllResultsCSV()
    {
        $allResultsArray = $this->getAllResults();
        // Handles names and clubs with accents correctly. The media type and
        // the charset go in a single header because a second Content-Type
        // header() call replaces the first one.
        // If UTF8 encoding being used, you may have to manually select
        // that when opening the CSV file in your CSV reader
        header('Content-Type: text/csv; charset=utf-8');
        header("Content-Disposition: attachment; filename=fencingtime.csv");
        $fp = fopen('php://output', 'w');
        if ($fp === false) {
            return;
        }
        // Delimiter, enclosure and escape are spelled out with their
        // long-standing default values
        fputcsv($fp, array('Rank', 'Surname', 'Forename', 'Club', 'Country'), ',', '"', '\\');
        foreach ($allResultsArray as $value) {
            fputcsv($fp, $value, ',', '"', '\\');
        }
        fclose($fp);
    }

    /**
     * Sends all the results to the output as a JSON array of
     * [rank, surname, forename, club, country] rows
     *
     * @return none
     */
    public function getAllResultsJSON()
    {
        $allResultsArray = $this->getAllResults();
        // Handles names and clubs with accents correctly; one header carries
        // both the media type and the charset
        header('Content-Type: application/json; charset=utf-8');
        echo json_encode($allResultsArray);
    }
}