4
4
5
5
use TextAnalysis \Tokenizers \GeneralTokenizer ;
6
6
use TextAnalysis \LexicalDiversity \Naive ;
7
+ use TextAnalysis \Utilities \Text ;
7
8
8
9
/**
9
10
* Explore the text corpus
10
11
* @author yooper
11
12
*/
12
- class TextCorpus
13
+ class TextCorpus
13
14
{
14
15
/**
15
16
*
16
17
* @var string
17
18
*/
18
19
protected $ text ;
19
-
20
+
20
21
/**
21
22
*
22
23
* @var array
23
24
*/
24
25
protected $ tokens = [];
25
-
26
/**
 * Create a corpus around a raw piece of text.
 *
 * @param string $text The text to explore; stored as given.
 */
public function __construct(string $text)
{
    $this->text = $text;
}
30
-
31
+
31
32
/**
32
33
* Returns the original text
33
34
* @return string
@@ -36,15 +37,15 @@ public function getText() : string
36
37
{
37
38
return $ this ->text ;
38
39
}
39
-
40
+
40
41
public function getTokens(string $tokenizerClassName = GeneralTokenizer::class) : array
{
    // Serve the cached token list when one has already been built.
    // Note: the cache is not keyed by tokenizer, so a later call with a
    // different $tokenizerClassName still returns the first result, and an
    // empty token list is never considered cached.
    if (!empty($this->tokens)) {
        return $this->tokens;
    }

    $this->tokens = tokenize($this->getText(), $tokenizerClassName);

    return $this->tokens;
}
47
-
48
+
48
49
/**
49
50
* Return a list of positions where the needles were found in the text
50
51
* @param array $needles
@@ -59,7 +60,7 @@ public function getDispersion(array $needles) : array
59
60
}
60
61
return $ found ;
61
62
}
62
-
63
+
63
64
/**
64
65
* Compute the lexical diversity, the default uses a naive algorithm
65
66
* @param string $lexicalDiversityClassName
@@ -69,23 +70,51 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
69
70
{
70
71
return lexical_diversity ($ this ->getTokens (), $ lexicalDiversityClassName );
71
72
}
72
-
73
+
73
74
/**
74
75
* See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
75
76
* @param string $needle
76
77
* @param int $contextLength The amount of space left and right of the found needle
77
78
* @param bool $ignorecase
78
79
* @param string $position Available options: contain, begin, end, equal.
80
+ * @param bool $mark Option to mark the needle
79
81
* @return array
80
82
*/
81
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
{
    // An empty needle would match at every offset of the text and produce one
    // meaningless excerpt per position, so report nothing instead.
    if ($needle === '') {
        return [];
    }

    // Temporary solution to handle unicode chars: search a Latin-1 copy of the
    // text, leaving $this->text itself untouched.
    // NOTE(review): utf8_decode()/utf8_encode() are deprecated as of PHP 8.2 and
    // replace characters outside ISO-8859-1 with '?'. Replacing them needs a
    // coordinated change with concordancePositions() and the Text helpers.
    $text = utf8_decode($this->text);

    // Collapse every run of whitespace into a single space.
    $text = trim(preg_replace('/\s+/', ' ', $text));

    $needle = utf8_decode($needle);
    $needleLength = strlen($needle);

    $positions = $this->concordancePositions($text, $needle, $contextLength, $ignorecase, $position);

    // The excerpt has to be widened by the length of both markers when marking
    // is requested; derive that from the markers instead of hard-coding it.
    $markers = ['{{', '}}'];
    $excerptLength = $mark
        ? $needleLength + strlen($markers[0]) + strlen($markers[1])
        : $needleLength;

    // Getting excerpts
    $found = [];
    foreach ($positions as $needlePosition) {
        // Marking the term: presumably markString() wraps the needle at
        // $needlePosition in the two markers -- confirm against Text::markString.
        $source = $mark
            ? Text::markString($text, $needlePosition, $needleLength, $markers)
            : $text;

        $found[] = utf8_encode(Text::getExcerpt($source, $needlePosition, $excerptLength, $contextLength));
    }

    return $found;
}
105
+
106
+ /**
107
+ * Return all positions of the needle in the text according to the position of the needle in a word.
108
+ * @param string $text
109
+ * @param string $needle
110
+ * @param int $contextLength The amount of space left and right of the found needle
111
+ * @param bool $ignorecase
112
+ * @param string $position Available options: contain, begin, end, equal.
113
+ * @return array
114
+ */
115
+ public function concordancePositions (string $ text , string $ needle , int $ contextLength = 20 , bool $ ignorecase = true , string $ position = 'contain ' ) : array
116
+ {
87
117
$ found = [];
88
- $ text = ' ' . trim (preg_replace ('/[\s\t\n\r\s]+/ ' , ' ' , $ this ->text )) . ' ' ;
89
118
$ needleLength = strlen ($ needle );
90
119
$ textLength = strlen ($ text );
91
120
$ bufferLength = $ needleLength + 2 * $ contextLength ;
@@ -97,13 +126,13 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
97
126
98
127
switch ($ position ) {
99
128
case 'equal ' :
100
- $ pattern = "/[^ $ word_part]( $ needle)[^ $ word_part]/ " ;
129
+ $ pattern = "/(?<![ $ word_part]) ( $ needle)(?![ $ word_part]) / " ;
101
130
break ;
102
131
case 'begin ' :
103
- $ pattern = "/[^ $ word_part]( $ needle)[ $ special_chars]?[\p{L}]*|^( $ needle)/ " ;
132
+ $ pattern = "/(?<![ $ word_part]) ( $ needle)[ $ special_chars]?[\p{L}]*|^( $ needle)/ " ;
104
133
break ;
105
134
case 'end ' :
106
- $ pattern = "/[\p{L}]*[ $ special_chars]?[\p{L}]*( $ needle)[^ $ word_part]/ " ;
135
+ $ pattern = "/[\p{L}]*[ $ special_chars]?[\p{L}]*( $ needle)(?![ $ word_part]) / " ;
107
136
break ;
108
137
case 'contain ' :
109
138
$ pattern = "/( $ needle)/ " ;
@@ -115,24 +144,11 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
115
144
116
145
$ case = $ ignorecase ? 'i ' : '' ;
117
146
preg_match_all ($ pattern .$ case , $ text , $ matches , PREG_OFFSET_CAPTURE );
147
+ $ positions = array_column ($ matches [1 ], 1 );
118
148
119
- // Getting excerpts
120
- foreach ($ matches [1 ] as $ match ) {
121
-
122
- $ needlePosition = $ match [1 ];
123
- $ left = max ($ needlePosition - $ contextLength , 0 );
124
-
125
- if ($ needleLength + $ contextLength + $ needlePosition > $ textLength ) {
126
- $ tmp = substr ($ text , $ left );
127
- } else {
128
- $ tmp = substr ($ text , $ left , $ bufferLength );
129
- }
130
- $ found [] = utf8_encode ($ tmp );
131
- }
132
-
133
- return $ found ;
149
+ return $ positions ;
134
150
}
135
-
151
+
136
152
/**
137
153
* Get percentage of times the needle shows up in the text
138
154
* @param string $needle
@@ -143,7 +159,7 @@ public function percentage(string $needle) : float
143
159
$ freqDist = freq_dist ($ this ->getTokens ());
144
160
return $ freqDist ->getKeyValuesByFrequency ()[$ needle ] / $ freqDist ->getTotalTokens ();
145
161
}
146
-
162
+
147
163
/**
148
164
* Performs a case insensitive search for the needle
149
165
* @param string $needle
@@ -153,7 +169,7 @@ public function count(string $needle) : int
153
169
{
154
170
return substr_count (strtolower ($ this ->getText ()), strtolower ($ needle ));
155
171
}
156
-
172
+
157
173
/**
158
174
* Return all the positions of the needle found in the text
159
175
* @param string $needle
@@ -166,7 +182,7 @@ public function findAll(string $needle) : array
166
182
$ needle = strtolower ($ needle );
167
183
$ text = strtolower ($ this ->getText ());
168
184
$ needleLength = strlen ($ needle );
169
- while (($ lastPos = stripos ($ text , $ needle , $ lastPos ))!== false )
185
+ while (($ lastPos = stripos ($ text , $ needle , $ lastPos ))!== false )
170
186
{
171
187
$ positions [] = $ lastPos ;
172
188
$ lastPos += $ needleLength ;
@@ -177,8 +193,8 @@ public function toString()
177
193
{
178
194
return $ this ->text ;
179
195
}
180
-
181
- public function __destruct ()
196
+
197
+ public function __destruct ()
182
198
{
183
199
unset($ this ->text );
184
200
unset($ this ->tokens );
0 commit comments