Skip to content

Commit a446020

Browse files
authored
Merge pull request #46 from Euak/concod-mark
Improved concordance method
2 parents ba5c76f + 758e7fb commit a446020

File tree

2 files changed

+108
-52
lines changed

2 files changed

+108
-52
lines changed

src/Corpus/TextCorpus.php

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,31 @@
44

55
use TextAnalysis\Tokenizers\GeneralTokenizer;
66
use TextAnalysis\LexicalDiversity\Naive;
7+
use TextAnalysis\Utilities\Text;
78

89
/**
910
* Explore the text corpus
1011
* @author yooper
1112
*/
12-
class TextCorpus
13+
class TextCorpus
1314
{
1415
/**
1516
*
1617
* @var string
1718
*/
1819
protected $text;
19-
20+
2021
/**
2122
*
2223
* @var array
2324
*/
2425
protected $tokens = [];
25-
26-
public function __construct(string $text)
26+
27+
public function __construct(string $text)
2728
{
2829
$this->text = $text;
2930
}
30-
31+
3132
/**
3233
* Returns the original text
3334
* @return string
@@ -36,15 +37,15 @@ public function getText() : string
3637
{
3738
return $this->text;
3839
}
39-
40+
4041
public function getTokens(string $tokenizerClassName = GeneralTokenizer::class) : array
4142
{
42-
if(empty($this->tokens)) {
43+
if(empty($this->tokens)) {
4344
$this->tokens = tokenize($this->getText(), $tokenizerClassName);
4445
}
4546
return $this->tokens;
4647
}
47-
48+
4849
/**
4950
* Return a list of positions where the needles were found in the text
5051
* @param array $needles
@@ -59,7 +60,7 @@ public function getDispersion(array $needles) : array
5960
}
6061
return $found;
6162
}
62-
63+
6364
/**
6465
* Compute the lexical diversity, the default uses a naive algorithm
6566
* @param string $lexicalDiversityClassName
@@ -69,23 +70,51 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
6970
{
7071
return lexical_diversity($this->getTokens(), $lexicalDiversityClassName);
7172
}
72-
73+
7374
/**
7475
* See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
7576
* @param string $needle
7677
* @param int $contextLength The amount of space left and right of the found needle
7778
* @param bool $ignorecase
7879
* @param string $position Available options: contain, begin, end, equal.
80+
* @param bool $mark Option to mark the needle
7981
* @return array
8082
*/
81-
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
83+
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
8284
{
8385
// temporary solution to handle unicode chars
84-
$this->text = utf8_decode($this->text);
86+
$text = utf8_decode($this->text);
87+
$text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $text));
8588
$needle = utf8_decode($needle);
86-
89+
$needleLength = strlen($needle);
90+
$found = [];
91+
92+
$positions = $this->concordancePositions($text, $needle, $contextLength, $ignorecase, $position);
93+
94+
// Getting excerpts
95+
foreach($positions as $needlePosition) {
96+
//marking the term
97+
$text_marked = ($mark) ? Text::markString($text, $needlePosition, $needleLength, ['{{','}}']) : $text;
98+
$needleLength_marked = ($mark) ? $needleLength+4 : $needleLength;
99+
100+
$found[] = utf8_encode(Text::getExcerpt($text_marked, $needlePosition, $needleLength_marked, $contextLength));
101+
}
102+
103+
return $found;
104+
}
105+
106+
/**
107+
* Return all positions of the needle in the text according to the position of the needle in a word.
108+
* @param string $text
109+
* @param string $needle
110+
* @param int $contextLength The amount of space left and right of the found needle
111+
* @param bool $ignorecase
112+
* @param string $position Available options: contain, begin, end, equal.
113+
* @return array
114+
*/
115+
public function concordancePositions(string $text, string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
116+
{
87117
$found = [];
88-
$text = ' ' . trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text)) . ' ';
89118
$needleLength = strlen($needle);
90119
$textLength = strlen($text);
91120
$bufferLength = $needleLength + 2 * $contextLength;
@@ -97,13 +126,13 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
97126

98127
switch ($position) {
99128
case 'equal':
100-
$pattern = "/[^$word_part]($needle)[^$word_part]/";
129+
$pattern = "/(?<![$word_part])($needle)(?![$word_part])/";
101130
break;
102131
case 'begin':
103-
$pattern = "/[^$word_part]($needle)[$special_chars]?[\p{L}]*|^($needle)/";
132+
$pattern = "/(?<![$word_part])($needle)[$special_chars]?[\p{L}]*|^($needle)/";
104133
break;
105134
case 'end':
106-
$pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)[^$word_part]/";
135+
$pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)(?![$word_part])/";
107136
break;
108137
case 'contain':
109138
$pattern = "/($needle)/";
@@ -115,24 +144,11 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
115144

116145
$case = $ignorecase ? 'i' : '';
117146
preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
147+
$positions = array_column($matches[1], 1);
118148

119-
// Getting excerpts
120-
foreach($matches[1] as $match) {
121-
122-
$needlePosition = $match[1];
123-
$left = max($needlePosition - $contextLength, 0);
124-
125-
if($needleLength + $contextLength + $needlePosition > $textLength) {
126-
$tmp = substr($text, $left);
127-
} else {
128-
$tmp = substr($text, $left, $bufferLength);
129-
}
130-
$found[] = utf8_encode($tmp);
131-
}
132-
133-
return $found;
149+
return $positions;
134150
}
135-
151+
136152
/**
137153
* Get percentage of times the needle shows up in the text
138154
* @param string $needle
@@ -143,7 +159,7 @@ public function percentage(string $needle) : float
143159
$freqDist = freq_dist($this->getTokens());
144160
return $freqDist->getKeyValuesByFrequency()[$needle] / $freqDist->getTotalTokens();
145161
}
146-
162+
147163
/**
148164
* Performs a case insensitive search for the needle
149165
* @param string $needle
@@ -153,7 +169,7 @@ public function count(string $needle) : int
153169
{
154170
return substr_count(strtolower($this->getText()), strtolower($needle));
155171
}
156-
172+
157173
/**
158174
* Return all the position of the needle found in the text
159175
* @param string $needle
@@ -166,7 +182,7 @@ public function findAll(string $needle) : array
166182
$needle = strtolower($needle);
167183
$text = strtolower($this->getText());
168184
$needleLength = strlen($needle);
169-
while (($lastPos = stripos($text, $needle, $lastPos))!== false)
185+
while (($lastPos = stripos($text, $needle, $lastPos))!== false)
170186
{
171187
$positions[] = $lastPos;
172188
$lastPos += $needleLength;
@@ -177,8 +193,8 @@ public function toString()
177193
{
178194
return $this->text;
179195
}
180-
181-
public function __destruct()
196+
197+
public function __destruct()
182198
{
183199
unset($this->text);
184200
unset($this->tokens);

src/Utilities/Text.php

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
* Additional string functions
66
* @author yooper (yooper)
77
*/
8-
class Text
8+
class Text
99
{
1010
protected function __construct(){}
11-
11+
1212
/**
1313
* http://stackoverflow.com/questions/834303/php-startswith-and-endswith-functions
1414
* @param string $haystack
1515
* @param string $needle
16-
* @return boolean
16+
* @return boolean
1717
*/
1818
static public function startsWith($haystack, $needle)
1919
{
@@ -23,19 +23,19 @@ static public function startsWith($haystack, $needle)
2323
/**
2424
* @param string $haystack
2525
* @param string $needle
26-
* @return boolean
26+
* @return boolean
2727
*/
2828
static public function endsWith($haystack, $needle)
2929
{
3030
return strpos($haystack, $needle) === (strlen($haystack) - strlen($needle));
3131
}
32-
32+
3333
static public function contains($haystack, $needle)
3434
{
3535
return (strpos($haystack, $needle) !== false);
3636
}
37-
38-
37+
38+
3939
/**
4040
* Takes a string and produces all possible substrings
4141
* @param string $text
@@ -45,17 +45,17 @@ static public function getAllSubStrings($text)
4545
{
4646
$splitText = str_split($text);
4747
$splitCount = count($splitText);
48-
$subStrings = [];
49-
for ($i = 0; $i < $splitCount; $i++)
48+
$subStrings = [];
49+
for ($i = 0; $i < $splitCount; $i++)
5050
{
51-
for ($j = $i; $j < $splitCount; $j++)
51+
for ($j = $i; $j < $splitCount; $j++)
5252
{
5353
$subStrings[] = implode(array_slice($splitText, $i, $j - $i + 1));
54-
}
54+
}
5555
}
56-
return $subStrings;
56+
return $subStrings;
5757
}
58-
58+
5959
/**
6060
* Find Date in a String
6161
*
@@ -215,5 +215,45 @@ static public function findDate( $string ) {
215215
else
216216
return $date;
217217
}
218-
218+
219+
/**
220+
* Mark a string
221+
*
222+
* @param string $text
223+
* @param int $position
224+
* @param int $length
225+
* @param array $mark
226+
* @return string
227+
*/
228+
static function markString(string $text, int $position, int $length, array $mark) : string
229+
{
230+
$text = substr_replace($text, $mark[0], $position, 0);
231+
$text = substr_replace($text, $mark[1], $position + $length + strlen($mark[0]), 0);
232+
233+
return $text;
234+
}
235+
236+
/**
237+
* Get an excerpt from a string by a needle position and its length
238+
*
239+
* @param string $text
240+
* @param int $needlePosition
241+
* @param int $needleLength
242+
* @param int $contextLength
243+
* @return string
244+
*/
245+
static function getExcerpt(String $text, int $needlePosition, int $needleLength, int $contextLength) : string
246+
{
247+
$left = max($needlePosition - $contextLength, 0);
248+
$bufferLength = $needleLength + (2 * $contextLength);
249+
250+
if($needleLength + $contextLength + $needlePosition > strlen($text)) {
251+
$text = substr($text, $left);
252+
} else {
253+
$text = substr($text, $left, $bufferLength);
254+
}
255+
256+
return $text;
257+
}
258+
219259
}

0 commit comments

Comments
 (0)