Merge pull request #46 from Euak/concod-mark

yooper · web-flow · commit a4460204a42e · 2019-01-14T08:42:59.000-05:00
Improved concordance method
diff --git a/src/Corpus/TextCorpus.php b/src/Corpus/TextCorpus.php
@@ -4,30 +4,31 @@
 
 use TextAnalysis\Tokenizers\GeneralTokenizer;
 use TextAnalysis\LexicalDiversity\Naive;
+use TextAnalysis\Utilities\Text;
 
 /**
  * Explore the text corpus
  * @author yooper
  */
-class TextCorpus 
+class TextCorpus
 {
     /**
      *
      * @var string
      */
     protected $text;
-    
+
     /**
      *
      * @var array
      */
     protected $tokens = [];
-    
-    public function __construct(string $text) 
+
+    public function __construct(string $text)
     {
         $this->text = $text;
     }
-    
+
     /**
      * Returns the original text
      * @return string
@@ -36,15 +37,15 @@ public function getText() : string
     {
         return $this->text;
     }
-    
+
     public function getTokens(string $tokenizerClassName = GeneralTokenizer::class) : array
     {
-        if(empty($this->tokens)) {            
+        if(empty($this->tokens)) {
             $this->tokens = tokenize($this->getText(), $tokenizerClassName);
         }
         return $this->tokens;
     }
-    
+
     /**
      * Return a list of positions that the needs were found in the text
      * @param array $needles
@@ -59,7 +60,7 @@ public function getDispersion(array $needles) : array
         }
         return $found;
     }
-    
+
     /**
      * Compute the lexical diversity, the default uses a naive algorithm
      * @param string $lexicalDiversityClassName
@@ -69,23 +70,51 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
     {
         return lexical_diversity($this->getTokens(), $lexicalDiversityClassName);
     }
-    
+
     /**
      * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
      * @param string $needle
      * @param int $contextLength The amount of space left and right of the found needle
      * @param bool $ignorecase
      * @param int $position. Available options: contain, begin, end, equal.
+     * @param bool $mark Whether to wrap each found needle in {{ and }} markers
      * @return array
      */
-    public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
+    public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
     {
         // temporary solution to handle unicode chars
-        $this->text = utf8_decode($this->text);
+        $text = utf8_decode($this->text);
+        $text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $text));
         $needle = utf8_decode($needle);
-        
+        $needleLength = strlen($needle);
+        $found = [];
+
+        $positions = $this->concordancePositions($text, $needle, $contextLength, $ignorecase, $position);
+
+        // Getting excerpts
+        foreach($positions as $needlePosition) {
+            //marking the term
+            $text_marked = ($mark) ? Text::markString($text, $needlePosition, $needleLength, ['{{','}}']) : $text;
+            $needleLength_marked = ($mark) ? $needleLength+4 : $needleLength;
+
+            $found[] = utf8_encode(Text::getExcerpt($text_marked, $needlePosition, $needleLength_marked, $contextLength));
+        }
+
+        return $found;
+    }
+
+    /**
+     * Return all positions of the needle in the text according to the position of the needle in a word.
+     * @param string $text
+     * @param string $needle
+     * @param int $contextLength The amount of space left and right of the found needle
+     * @param bool $ignorecase
+     * @param string $position Available options: contain, begin, end, equal.
+     * @return array
+     */
+    public function concordancePositions(string $text, string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
+    {
         $found = [];
-        $text = ' ' . trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text)) . ' ';
         $needleLength = strlen($needle);
         $textLength = strlen($text);
         $bufferLength = $needleLength + 2 * $contextLength;
@@ -97,13 +126,13 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
 
         switch ($position) {
             case 'equal':
-                $pattern = "/[^$word_part]($needle)[^$word_part]/";
+                $pattern = "/(?<![$word_part])($needle)(?![$word_part])/";
                 break;
             case 'begin':
-                $pattern = "/[^$word_part]($needle)[$special_chars]?[\p{L}]*|^($needle)/";
+                $pattern = "/(?<![$word_part])($needle)[$special_chars]?[\p{L}]*|^($needle)/";
                 break;
             case 'end':
-                $pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)[^$word_part]/";
+                $pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)(?![$word_part])/";
                 break;
             case 'contain':
                 $pattern = "/($needle)/";
@@ -115,24 +144,11 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
 
         $case = $ignorecase ? 'i' : '';
         preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
+        $positions = array_column($matches[1], 1);
 
-        // Getting excerpts
-        foreach($matches[1] as $match) {
-
-            $needlePosition = $match[1];
-            $left = max($needlePosition - $contextLength, 0);
-
-            if($needleLength + $contextLength + $needlePosition > $textLength) {
-                $tmp = substr($text, $left);
-            } else {
-                $tmp = substr($text, $left, $bufferLength);
-            }
-            $found[] = utf8_encode($tmp);
-        }
-
-        return $found;
+        return $positions;
     }
-    
+
     /**
      * Get percentage of times the needle shows up in the text
      * @param string $needle
@@ -143,7 +159,7 @@ public function percentage(string $needle) : float
         $freqDist = freq_dist($this->getTokens());
         return $freqDist->getKeyValuesByFrequency()[$needle] / $freqDist->getTotalTokens();
     }
-    
+
     /**
      * Performs a case insensitive search for the needle
      * @param string $needle
@@ -153,7 +169,7 @@ public function count(string $needle) : int
     {
         return substr_count(strtolower($this->getText()), strtolower($needle));
     }
-    
+
     /**
      * Return all the position of the needle found in the text
      * @param string $needle
@@ -166,7 +182,7 @@ public function findAll(string $needle) : array
         $needle = strtolower($needle);
         $text = strtolower($this->getText());
         $needleLength = strlen($needle);
-        while (($lastPos = stripos($text, $needle, $lastPos))!== false) 
+        while (($lastPos = stripos($text, $needle, $lastPos))!== false)
         {
             $positions[] = $lastPos;
             $lastPos += $needleLength;
@@ -177,8 +193,8 @@ public function toString()
     {
         return $this->text;
     }
-    
-    public function __destruct() 
+
+    public function __destruct()
     {
         unset($this->text);
         unset($this->tokens);
diff --git a/src/Utilities/Text.php b/src/Utilities/Text.php
@@ -5,15 +5,15 @@
  * Additional string functions
  * @author yooper (yooper)
  */
-class Text 
+class Text
 {
     protected function __construct(){}
-    
+
     /**
      * http://stackoverflow.com/questions/834303/php-startswith-and-endswith-functions
      * @param string $haystack
      * @param string $needle
-     * @return boolean 
+     * @return boolean
      */
     static public function startsWith($haystack, $needle)
     {
@@ -23,19 +23,19 @@ static public function startsWith($haystack, $needle)
     /**
      * @param string $haystack
      * @param string $needle
-     * @return boolean 
+     * @return boolean
      */
     static public function endsWith($haystack, $needle)
     {
         return strpos($haystack, $needle) === (strlen($haystack) - strlen($needle));
     }
-    
+
     static public function contains($haystack, $needle)
     {
         return (strpos($haystack, $needle) !== false);
     }
-    
-    
+
+
     /**
      * Takes a string and produces all possible substrings
      * @param string $text
@@ -45,17 +45,17 @@ static public function getAllSubStrings($text)
     {
         $splitText = str_split($text);
         $splitCount = count($splitText);
-        $subStrings = [];        
-        for ($i = 0; $i < $splitCount; $i++) 
+        $subStrings = [];
+        for ($i = 0; $i < $splitCount; $i++)
         {
-            for ($j = $i; $j < $splitCount; $j++) 
+            for ($j = $i; $j < $splitCount; $j++)
             {
                 $subStrings[] = implode(array_slice($splitText, $i, $j - $i + 1));
-            }       
+            }
         }
-        return $subStrings;        
+        return $subStrings;
     }
-    
+
     /**
      * Find Date in a String
      *
@@ -215,5 +215,45 @@ static public function findDate( $string ) {
       else
         return $date;
     }
-  
+
+    /**
+    * Mark a section of a string by inserting $mark[0] before it and $mark[1] after it
+    *
+    * @param string $text
+    * @param int $position
+    * @param int $length
+    * @param array $mark
+    * @return string
+    */
+    static function markString(string $text, int $position, int $length, array $mark) : string
+    {
+        $text = substr_replace($text, $mark[0], $position, 0);
+        $text = substr_replace($text, $mark[1], $position + $length + strlen($mark[0]), 0);
+
+        return $text;
+    }
+
+    /**
+    * Get an excerpt from a string by a needle position and its length
+    *
+    * @param string $text
+    * @param int $needlePosition
+    * @param int $needleLength
+    * @param int $contextLength
+    * @return string
+    */
+    static function getExcerpt(String $text, int $needlePosition, int $needleLength, int $contextLength) : string
+    {
+        $left = max($needlePosition - $contextLength, 0);
+        $bufferLength = $needleLength + (2 * $contextLength);
+
+        if($needleLength + $contextLength + $needlePosition > strlen($text)) {
+            $text = substr($text, $left);
+        } else {
+            $text = substr($text, $left, $bufferLength);
+        }
+
+        return $text;
+    }
+
 }