Skip to content

Commit 5ce51c2

Browse files
committed
Setup
0 parents  commit 5ce51c2

35 files changed

+4859
-0
lines changed

README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
## PHP Textract
2+
A Laravel 9 package to extract text from files. This work is inspired by ["textract node"](https://www.npmjs.com/package/textract).
3+
4+
### Supported file formats
5+
The following file formats are currently supported. You need to install the proper extensions
6+
on your server to work with all of the following file types. The package
7+
checks the file's content MIME type before executing, so in addition to the file extension you must make sure
8+
the file has the correct content type for this package to work:
9+
- HTML
10+
- TEXT
11+
- DOC
12+
- DOCX
13+
- XLS, XLSX, XLSM, XLTX, XLTM, XLT
14+
- CSV
15+
- PDF
16+
- Image
17+
- Jpeg
18+
- Pdf
19+
- ODT
20+
- ODS
21+
- RTF
22+
23+
### Install
24+
```
25+
composer require nilgems/php-textract:^0.1
26+
```
27+
### Configuration
28+
You don't need to do anything special for your Laravel application to work with this
29+
package.
30+
31+
### Example
32+
Use the ```Nilgems\PhpTextract\Textract``` facade to run the extractor.
33+
34+
Example 1:
35+
```
36+
........
37+
use Nilgems\PhpTextract\Textract;
38+
39+
Route::get('/textract', function(){
40+
return Textract::run({file_path});
41+
});
42+
........
43+
```
44+
45+
Example 2:
46+
47+
### Dependencies

composer.json

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"name": "nilgems/php-textract",
3+
"description": "A text extraction package for laravel",
4+
"type": "library",
5+
"require": {
6+
"php": "^8.0",
7+
"ext-fileinfo": "*",
8+
"ext-zip": "*",
9+
"symfony/process": "6.1",
10+
"illuminate/console": "^9",
11+
"phpoffice/phpspreadsheet": "^1.23",
12+
"phpoffice/phpword": "^0.18"
13+
},
14+
"require-dev": {
15+
"squizlabs/php_codesniffer": "^3.6"
16+
},
17+
"license": "MIT",
18+
"autoload": {
19+
"psr-4": {
20+
"Nilgems\\PhpTextract\\": "src/"
21+
}
22+
},
23+
"extra": {
24+
"laravel": {
25+
"providers": [
26+
"Nilgems\\PhpTextract\\Providers\\ServiceProvider"
27+
],
28+
"aliases": {
29+
"Textract":"Nilgems\\PhpTextract\\Textract"
30+
}
31+
}
32+
},
33+
"authors": [
34+
{
35+
"name": "Niladri Shekhar Mondal",
36+
"email": "[email protected]"
37+
}
38+
],
39+
"minimum-stability": "dev"
40+
}

config/textract.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<?php
2+
3+
// Package configuration: intentionally an empty array, no options are defined in this file.
return [];

src/Concerns/AbstractExtractor.php

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Concerns;
4+
5+
use Illuminate\Support\Collection;
6+
use Nilgems\PhpTextract\Exceptions\TextractException;
7+
use Nilgems\PhpTextract\Services\UtilsService;
8+
9+
/**
 * Base class for file-type specific text extractors.
 *
 * Holds the target file path, the MIME types / extensions a concrete
 * extractor accepts and a bag of extra data. Concrete extractors provide the
 * availability check and the actual extraction via the two abstract methods.
 */
abstract class AbstractExtractor
{
    /**
     * Path of the file handed to boot().
     *
     * @var string $file_path
     */
    protected string $file_path = "";

    /**
     * Message of the exception thrown by boot() when
     * checkHaveProviderPackage() returns a falsy value.
     *
     * @var string $error_message
     */
    protected string $error_message = "The extractor plugin is not installed in the system. Please install and try again.";

    /**
     * Extractor name, used as the prefix of the MIME-mismatch error message.
     *
     * @var string $extractor_name
     */
    protected string $extractor_name = 'The extractor';

    /**
     * Extensions reported by getAcceptExtensions() and listed in the
     * MIME-mismatch error message.
     *
     * @var array $extractor_supported_extension
     */
    protected array $extractor_supported_extension = [];

    /**
     * Accepted MIME types, used when the subclass defines no mimeAccepts() method.
     *
     * @var array $mime_accepts
     */
    protected array $mime_accepts = [];

    /**
     * MIME type detected for the file during boot().
     *
     * @var string $current_mime_type
     */
    protected string $current_mime_type = "";

    /**
     * Extra key/value data set through setData() or passed to boot().
     *
     * @var Collection $data
     */
    protected Collection $data;

    public function __construct()
    {
        $this->data = new Collection([]);
    }

    /**
     * Store a single value in the extractor's data bag.
     *
     * @param $key
     * @param $value
     * @return $this
     */
    public function setData($key, $value): self
    {
        $this->data->put($key, $value);

        return $this;
    }

    /**
     * Get the accepted MIME types.
     *
     * A subclass may define a mimeAccepts() method to compute the list;
     * otherwise the $mime_accepts property is returned.
     *
     * @return array
     */
    public function getAcceptMimeTypes(): array
    {
        if (method_exists($this, 'mimeAccepts')) {
            return $this->mimeAccepts();
        }

        return $this->mime_accepts;
    }

    /**
     * Get the acceptable extensions.
     *
     * @return array
     */
    public function getAcceptExtensions(): array
    {
        return $this->extractor_supported_extension;
    }

    /**
     * Tell whether the given MIME type is accepted by this extractor.
     *
     * An empty accept list means "accept everything". The candidate is
     * lower-cased before the strict comparison; the accept list entries are
     * compared as-is, so they are expected to be lower-case.
     *
     * @param string $mime_type
     * @return bool
     */
    public function hasMatchMimeType(string $mime_type): bool
    {
        $acceptable_mime_type = $this->getAcceptMimeTypes();
        if (empty($acceptable_mime_type)) {
            return true;
        }

        return in_array(strtolower($mime_type), $acceptable_mime_type, true);
    }

    /**
     * Validate the file against this extractor and run the extraction.
     *
     * @param string $file_path Path of the file to extract text from.
     * @param array $data Extra data merged into the extractor's data bag.
     * @return string|null Whatever getTextFromFile() returns.
     * @throws TextractException When the detected MIME type is not accepted,
     *                           or when checkHaveProviderPackage() is falsy.
     */
    public function boot(string $file_path, array $data = []): ?string
    {
        $this->file_path = $file_path;
        $this->data = $this->data->merge($data);

        // The path is set exactly once; the original repeated the same call
        // on the returned service, which added nothing.
        $utilsService = app(UtilsService::class)->setFilePath($file_path);
        $this->current_mime_type = $utilsService->getFileMimeType();

        if (!$this->hasMatchMimeType($this->current_mime_type)) {
            // Note the leading space before "file." so the message reads
            // "... is a doc/docx file." instead of "... doc/docxfile.".
            throw new TextractException(
                $this->extractor_name
                . ' unable to process the file. Please ensure the content of file is a '
                . implode('/', $this->extractor_supported_extension)
                . ' file.'
            );
        }

        if ($this->checkHaveProviderPackage()) {
            return $this->getTextFromFile();
        }

        throw new TextractException($this->error_message);
    }

    /**
     * Report whether whatever this extractor depends on is available.
     * boot() treats a falsy result as "not installed" and throws.
     */
    abstract protected function checkHaveProviderPackage();

    /**
     * Extract and return the text of $this->file_path.
     */
    abstract protected function getTextFromFile();
}

src/Concerns/MustHaveResponse.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Concerns;
4+
5+
/**
 * Contract fixing the constructor signature of response objects.
 */
interface MustHaveResponse
{
    /**
     * The nullable parameters are declared explicitly as ?string: relying on
     * a `= null` default to make a typed parameter nullable is deprecated
     * as of PHP 8.4.
     *
     * @param string $job_id
     * @param string $file_path
     * @param string|null $output
     * @param string|null $error
     */
    public function __construct(string $job_id, string $file_path, ?string $output = null, ?string $error = null);
}

src/Concerns/TextractOutput.php

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Concerns;
4+
5+
use Illuminate\Contracts\Support\Arrayable;
6+
use Illuminate\Support\Collection;
7+
8+
/**
 * Result wrapper around the raw text produced by an extractor.
 *
 * @property-read string $text       The raw text given to the constructor.
 * @property-read int    $word_count Number of words as counted by str_word_count().
 */
class TextractOutput implements Arrayable
{
    /** Backing store holding the `text` and `word_count` entries. */
    protected Collection $collection;

    /**
     * @param string $raw_output Text to wrap; its word count is computed once here.
     */
    public function __construct(string $raw_output)
    {
        $payload = [
            'text' => $raw_output,
            'word_count' => str_word_count($raw_output, 0),
        ];

        $this->collection = new Collection($payload);
    }

    /**
     * Export the stored entries as a plain array.
     *
     * @return array
     */
    public function toArray(): array
    {
        $entries = $this->collection;

        return $entries->toArray();
    }

    /**
     * Magic read access to the stored entries (`text`, `word_count`).
     * Delegates to Collection::get() for the requested key.
     */
    public function __get(string $key) {
        $value = $this->collection->get($key);

        return $value;
    }

    /**
     * Casting the object to string yields the stored text.
     *
     * @return string
     */
    public function __toString(): string
    {
        $text = $this->collection->get('text');

        return $text;
    }
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Console\Contracts;
4+
5+
/**
 * Contract for console commands that perform a text extraction.
 */
interface MustHaveExtractCommand
{
    /**
     * Run the command.
     *
     * @return int Presumably the process exit status (0 = success) — confirm against callers.
     */
    public function handle(): int;
}

src/Console/TextractCommand.php

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Console;
4+
5+
use Illuminate\Console\Command;
6+
use Nilgems\PhpTextract\Console\Contracts\MustHaveExtractCommand;
7+
use Nilgems\PhpTextract\Services\ConsoleExtractionService;
8+
9+
/**
 * Console command `textract:run {file_path} {job_id}`.
 *
 * Resolves the `textract` entry from the container and runs it for the
 * given file path and job id.
 */
class TextractCommand extends Command implements MustHaveExtractCommand
{
    // NOTE(review): probably has no effect because $signature is also set,
    // and the value contains a space — confirm before relying on or removing it.
    protected $name = "Textract command";

    protected $signature = 'textract:run {file_path} {job_id}';

    protected $description = 'Extract text from the supported file';

    /**
     * Execute the extraction for the supplied arguments.
     *
     * @return int Exit status. 0 (self::SUCCESS) after a normal run; the
     *             previous hard-coded 1 signalled failure to the caller
     *             even when the extraction went through.
     */
    public function handle(): int
    {
        $file_path = $this->argument('file_path');
        $job_id = $this->argument('job_id');

        $output = app()->get('textract')->run($file_path, $job_id);

        // NOTE(review): this is the global info() helper, not $this->info();
        // presumably it records the output in the application log rather
        // than printing it to the console — confirm that this is intended.
        info($output);

        return self::SUCCESS;
    }
}

src/Exceptions/TextractException.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Exceptions;
4+
5+
/**
 * Package-specific exception type.
 *
 * Adds no behaviour of its own; it only gives callers a dedicated class to catch.
 */
class TextractException extends \Exception
{
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\Extractor\Contracts;
4+
5+
use PhpOffice\PhpWord\Element\Cell;
6+
use PhpOffice\PhpWord\Element\Text as PhpWordElementText;
7+
use PhpOffice\PhpWord\Element\TextRun as PhpWordElementTextRun;
8+
use PhpOffice\PhpWord\IOFactory;
9+
10+
/**
 * Helpers for pulling plain text out of documents loaded through PhpWord.
 */
trait HasPhpWord
{
    /**
     * Load a document and return the text of all its sections, joined by single spaces.
     *
     * @param string $file_path Path handed to IOFactory::load().
     * @param string $readerName PhpWord reader name; defaults to 'Word2007'.
     * @return string
     */
    protected function getSectionsText(string $file_path, string $readerName = 'Word2007'): string
    {
        $data = [];
        $phpWord = IOFactory::load($file_path, $readerName);
        foreach ($phpWord->getSections() as $section) {
            $data = [...$data, ...$this->getElementText($section->getElements())];
        }

        // Remove only empty fragments. A bare array_filter() would also
        // discard the legitimate text "0" because it is falsy in PHP.
        $fragments = array_filter(
            $data,
            static fn ($fragment): bool => (string) $fragment !== ''
        );

        return implode(" ", $fragments);
    }

    /**
     * Collect the trimmed text of the given elements as a flat list of strings.
     *
     * Handles plain text elements, text runs (recursively) and tables
     * (every cell of every row, recursively). Other element types are ignored.
     *
     * @param array $elements PhpWord elements, e.g. from a section, text run or cell.
     * @return array Flat list of strings, in document order.
     */
    protected function getElementText(array $elements): array
    {
        $docs = [];
        foreach ($elements as $element) {
            if ($element instanceof PhpWordElementText) {
                $docs[] = trim((string) $element->getText());
            }
            if ($element instanceof PhpWordElementTextRun) {
                $nested_data = $this->getElementText($element->getElements());
                $docs = [...$docs, ...$nested_data];
            }
            // Fully-qualified on purpose: the file does not import the Table element.
            if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
                $nested_data = $this->getTableRowText($element->getRows());
                $docs = [...$docs, ...$nested_data];
            }
        }

        return $docs;
    }

    /**
     * Collect the text of every cell in the given table rows.
     *
     * The result is a flat list of strings (not one array per cell) so it
     * can be merged straight into the list built by getElementText().
     *
     * @param array $rows Table rows; each must expose getCells().
     * @return array Flat list of strings, row by row, cell by cell.
     */
    protected function getTableRowText(array $rows): array
    {
        $data = [];
        foreach ($rows as $row) {
            foreach ($row->getCells() as $cell) {
                if ($cell instanceof Cell) {
                    $data = [...$data, ...$this->getElementText($cell->getElements())];
                }
            }
        }

        return $data;
    }
}

0 commit comments

Comments
 (0)