File tree 8 files changed +203
-11
lines changed
8 files changed +203
-11
lines changed Load Diff Large diffs are not rendered by default.
Original file line number Diff line number Diff line change @@ -27,6 +27,7 @@ check file content MIME type before execute.
27
27
- ** ODT**
28
28
- ** ODS**
29
29
- ** RTF**
30
+ - ** EPUB**
30
31
31
32
<img src="./blobs/warning.png?raw=true" alt="Note" width="12"> ***PPT*** support is under development.
32
33
Original file line number Diff line number Diff line change 39
39
"phpoffice/phpword" : " ^0.18" ,
40
40
"laravel/framework" : " ^8.0|^9.0" ,
41
41
"thiagoalessio/tesseract_ocr" : " ^2.12" ,
42
- "stechstudio/laravel-php-cs-fixer" : " ^3.1"
42
+ "stechstudio/laravel-php-cs-fixer" : " ^3.1" ,
43
+ "html2text/html2text" : " ^4.3" ,
44
+ "lywzx/php-epub" : " ^0.1.2"
43
45
},
44
46
"require-dev" : {
45
47
"friendsofphp/php-cs-fixer" : " ^v3.8.0" ,
Original file line number Diff line number Diff line change
1
+ <?php
2
+
3
+ namespace Nilgems \PhpTextract \ExtractorService \Extractors ;
4
+
5
+ use Html2Text \Html2Text ;
6
+ use lywzx \epub \EpubParser ;
7
+ use Nilgems \PhpTextract \Exceptions \TextractException ;
8
+ use Nilgems \PhpTextract \ExtractorService \Contracts \AbstractTextExtractor ;
9
+ use PhpOffice \PhpWord \Shared \ZipArchive ;
10
+
11
/**
 * Text extractor for EPUB documents.
 *
 * The table of contents is read with EpubParser; the file named by each TOC
 * entry is then read out of the archive and converted to plain text with
 * Html2Text.
 */
class EpubExtractor extends AbstractTextExtractor
{
    /** MIME types handled by this extractor. */
    protected array $supported_mime_types = [
        'application/epub+zip'
    ];

    /** File extensions handled by this extractor. */
    public array $supported_extension = ['epub'];

    /**
     * Get extracted text
     *
     * @return string The text of every TOC entry joined by "\n", or "" when
     *                the utils service yields no file path.
     * @throws \Nilgems\PhpTextract\Exceptions\TextractException When the archive cannot be opened or parsed.
     * @throws \Exception
     */
    protected function getExtractedText(): string
    {
        // Resolve the path once and reuse it for both the archive and the parser.
        $zip_path = $this->utilsService->getFilePath();
        if (!$zip_path) {
            return "";
        }

        $zip = null;
        $opened = false;

        try {
            $zip = new ZipArchive();

            // open() signals failure through its return value rather than by
            // throwing, so it has to be checked explicitly.
            if ($zip->open($zip_path) !== true) {
                throw new \RuntimeException('Unable to open the epub archive.');
            }
            $opened = true;

            $epub = new EpubParser($zip_path);
            $epub->parse();

            $data = [];
            foreach ($epub->getTOC() as $chapter) {
                $html = $zip->getFromName($chapter['file_name']);

                // A missing entry comes back as false; convert it as an empty
                // document so the chapter still occupies a line in the output.
                $data[] = trim((new Html2Text($html === false ? '' : $html))->getText());
            }

            // Clear the flag first so the cleanup below never closes twice.
            $opened = false;
            $zip->close();

            return implode("\n", $data);
        } catch (\Exception $exception) {
            report($exception);
            throw new TextractException('The extractor unable to parse the \'epub\' file.');
        } finally {
            // Reached with $opened still true only when something failed after
            // a successful open(): release the handle without masking the
            // TextractException that is already propagating.
            if ($opened) {
                try {
                    $zip->close();
                } catch (\Exception $closeException) {
                    report($closeException);
                }
            }
        }
    }
}
Original file line number Diff line number Diff line change 2
2
3
3
namespace Nilgems \PhpTextract \ExtractorService \Extractors ;
4
4
5
+ use Html2Text \Html2Text ;
5
6
use Nilgems \PhpTextract \ExtractorService \Contracts \TextProcessorHaveFilter ;
6
7
use Nilgems \PhpTextract \ExtractorService \ExtractorCommonProcessors \TextProcessor ;
7
8
@@ -21,7 +22,7 @@ class HtmlExtractor extends TextProcessor implements TextProcessorHaveFilter
21
22
/**
 * Convert the given markup to plain text using Html2Text.
 *
 * @param string $output Content to convert.
 * @return string The converted text, or "" when $output is empty.
 */
public function getFilteredText(string $output): string
{
    // empty() is also true for the string "0", so that input yields "" too.
    if (empty($output)) {
        return "";
    }

    $converter = new Html2Text($output);

    return $converter->getText();
}
Original file line number Diff line number Diff line change 3
3
namespace Nilgems \PhpTextract \Providers ;
4
4
5
5
use Illuminate \Support \ServiceProvider as IlluminateServiceProvider ;
6
+ use Nilgems \PhpTextract \ExtractorService \Extractors \EpubExtractor ;
6
7
use Nilgems \PhpTextract \ExtractorService \Extractors \HtmlExtractor ;
7
8
use Nilgems \PhpTextract \ExtractorService \Extractors \ImageExtractor ;
8
9
use Nilgems \PhpTextract \ExtractorService \Extractors \MsOfficeDocExtractor ;
@@ -56,7 +57,8 @@ protected function registerExtractors(): void
56
57
OpenOfficeSpreadSheet::class,
57
58
PdfExtractor::class,
58
59
RtfExtractor::class,
59
- TxtExtractor::class
60
+ TxtExtractor::class,
61
+ EpubExtractor::class
60
62
];
61
63
foreach ($ extractors as $ extractor ) {
62
64
$ this ->app ->bind ($ extractor );
You can’t perform that action at this time.
0 commit comments