Skip to content

Commit 6c7f192

Browse files
committed
Added support for main content extraction (fixes #5)
1 parent eff322c commit 6c7f192

File tree

4 files changed

+44
-0
lines changed

4 files changed

+44
-0
lines changed

src/Client.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,20 @@ public function getText($file)
120120
return $this->request('text', $file);
121121
}
122122

123+
/**
 * Runs the "text-main" request for a document and returns its result.
 *
 * This is a thin wrapper: the file argument is handed to request()
 * unchanged together with the 'text-main' request type.
 *
 * @param   string  $file   document to process, forwarded as-is to request()
 *
 * @return  string
 *
 * @throws  \Exception
 */
public function getMainText($file)
{
    $mainText = $this->request('text-main', $file);

    return $mainText;
}
136+
123137
/**
124138
* Returns current Tika version.
125139
*

src/Clients/CLIClient.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ protected function getArguments($type, $file = null)
187187
$arguments[] = '--text';
188188
break;
189189

190+
case 'text-main':
191+
$arguments[] = '--text-main';
192+
break;
193+
190194
case 'version':
191195
$arguments[] = '--version';
192196
break;

src/Clients/WebClient.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,11 @@ protected function getParameters($type, $file = null)
353353
$headers[] = 'Accept: text/plain';
354354
break;
355355

356+
case 'text-main':
357+
$resource = 'tika/main';
358+
$headers[] = 'Accept: text/plain';
359+
break;
360+
356361
case 'version':
357362
$resource = 'version';
358363
break;

tests/BaseTest.php

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,27 @@ public function testDocumentText($file)
186186
$this->assertContains('Zenonis est, inquam, hoc Stoici', self::$client->getText($file));
187187
}
188188

189+
/**
 * Main text test
 *
 * Skipped when the client under test runs in web mode against an Apache Tika
 * version older than 1.15; otherwise asserts that the extracted main text
 * contains a known sentence of the sample document.
 *
 * @dataProvider    fileProvider
 *
 * @param   string  $file
 */
public function testDocumentMainText($file)
{
    // a plain copy is enough here: the client is only read, never reassigned
    $client = self::$client;

    if($client::MODE == 'web' && version_compare(self::$version, '1.15', '<'))
    {
        // markTestSkipped() aborts the test by throwing, so no else branch is needed;
        // note the leading space so the version and the message do not run together
        $this->markTestSkipped('Apache Tika ' . self::$version . ' lacks main content extraction');
    }

    $this->assertContains('Sed quia studebat laudi et dignitati', $client->getMainText($file));
}
209+
189210
/**
190211
* Metadata test
191212
*

0 commit comments

Comments
 (0)