Skip to content

Commit c5317a9

Browse files
committed
Update
- Added "EPUB" file format support. Support for this format has been added for testing purposes. Please report to us if you find any errors with "EPUB" formatted files. - The HTML parser has been updated to improve stability.
1 parent 1524d1d commit c5317a9

File tree

8 files changed

+203
-11
lines changed

8 files changed

+203
-11
lines changed

.php-cs-fixer.cache

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ check file content MIME type before execute.
2727
- **ODT**
2828
- **ODS**
2929
- **RTF**
30+
- **EPUB**
3031

3132
<img src="./blobs/warning.png?raw=true" alt="Note" width="12">***PPT*** support is under development.
3233

composer.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@
3939
"phpoffice/phpword": "^0.18",
4040
"laravel/framework": "^8.0|^9.0",
4141
"thiagoalessio/tesseract_ocr": "^2.12",
42-
"stechstudio/laravel-php-cs-fixer": "^3.1"
42+
"stechstudio/laravel-php-cs-fixer": "^3.1",
43+
"html2text/html2text": "^4.3",
44+
"lywzx/php-epub": "^0.1.2"
4345
},
4446
"require-dev": {
4547
"friendsofphp/php-cs-fixer": "^v3.8.0",

composer.lock

Lines changed: 148 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\ExtractorService\Extractors;
4+
5+
use Html2Text\Html2Text;
6+
use lywzx\epub\EpubParser;
7+
use Nilgems\PhpTextract\Exceptions\TextractException;
8+
use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
9+
use PhpOffice\PhpWord\Shared\ZipArchive;
10+
11+
class EpubExtractor extends AbstractTextExtractor
{
    /**
     * MIME types this extractor accepts.
     */
    protected array $supported_mime_types = [
        'application/epub+zip'
    ];

    /**
     * File extensions this extractor accepts.
     */
    public array $supported_extension = ['epub'];

    /**
     * Extract plain text from an EPUB file.
     *
     * The EPUB is opened as a zip archive, its table of contents is read via
     * EpubParser, and every TOC entry's HTML document is converted to text
     * with Html2Text. The per-entry texts are joined with a newline.
     *
     * Returns an empty string when the utils service yields no file path.
     *
     * @throws \Nilgems\PhpTextract\Exceptions\TextractException when the archive
     *         cannot be opened or any step of the parsing raises an exception
     */
    protected function getExtractedText(): string
    {
        $zip_path = $this->utilsService->getFilePath();
        if (!$zip_path) {
            return "";
        }

        $zip = new ZipArchive();
        // Tracks whether open() succeeded so the archive is only closed when
        // there is actually something to close.
        $is_open = false;

        try {
            // open() reports failure through its return value rather than by
            // throwing, so it has to be checked explicitly.
            if ($zip->open($zip_path) !== true) {
                throw new \RuntimeException('Unable to open the \'epub\' archive.');
            }
            $is_open = true;

            $epub = new EpubParser($zip_path);
            $epub->parse();

            $data = [];
            foreach ($epub->getTOC() as $chapter) {
                $html = $zip->getFromName($chapter['file_name']);
                // A TOC entry whose file is missing from the archive yields a
                // non-string result; skip it instead of feeding it to Html2Text.
                if (!is_string($html)) {
                    continue;
                }
                $data[] = trim((new Html2Text($html))->getText());
            }

            return implode("\n", $data);
        } catch (\Exception $exception) {
            report($exception);
            throw new TextractException('The extractor unable to parse the \'epub\' file.');
        } finally {
            // Release the archive on both the success and the failure path.
            if ($is_open) {
                try {
                    $zip->close();
                } catch (\Exception $close_error) {
                    // The archive was only read; a failing close must not hide
                    // the extracted text or the original parsing error.
                    report($close_error);
                }
            }
        }
    }
}

src/ExtractorService/Extractors/HtmlExtractor.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace Nilgems\PhpTextract\ExtractorService\Extractors;
44

5+
use Html2Text\Html2Text;
56
use Nilgems\PhpTextract\ExtractorService\Contracts\TextProcessorHaveFilter;
67
use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\TextProcessor;
78

@@ -21,7 +22,7 @@ class HtmlExtractor extends TextProcessor implements TextProcessorHaveFilter
2122
public function getFilteredText(string $output): string
2223
{
2324
if (!empty($output)) {
24-
return strip_tags($output);
25+
return (new Html2Text($output))->getText();
2526
}
2627
return "";
2728
}

src/Providers/ServiceProvider.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
namespace Nilgems\PhpTextract\Providers;
44

55
use Illuminate\Support\ServiceProvider as IlluminateServiceProvider;
6+
use Nilgems\PhpTextract\ExtractorService\Extractors\EpubExtractor;
67
use Nilgems\PhpTextract\ExtractorService\Extractors\HtmlExtractor;
78
use Nilgems\PhpTextract\ExtractorService\Extractors\ImageExtractor;
89
use Nilgems\PhpTextract\ExtractorService\Extractors\MsOfficeDocExtractor;
@@ -56,7 +57,8 @@ protected function registerExtractors(): void
5657
OpenOfficeSpreadSheet::class,
5758
PdfExtractor::class,
5859
RtfExtractor::class,
59-
TxtExtractor::class
60+
TxtExtractor::class,
61+
EpubExtractor::class
6062
];
6163
foreach ($extractors as $extractor) {
6264
$this->app->bind($extractor);

storage/example.epub

187 KB
Binary file not shown.

0 commit comments

Comments
 (0)