Skip to content

Commit 6d2e3cb

Browse files
author
Stéphane Bauland
committed
Updated with PSR4 class and composer for autoloading packagist.
1 parent febe16b commit 6d2e3cb

File tree

4 files changed

+636
-1
lines changed

4 files changed

+636
-1
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ A PHP Bayesian classifier algorithm library.
33

44
### Installation
55
```bash
6-
composer require roomoot/classifier
6+
composer require rookmoot/classifier
77
```

composer.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"name": "rookmoot/classifier",
3+
"type": "library",
4+
"description": "A PHP Bayesian classifier algorithm library.",
5+
"keywords": ["Bayes", "Bayesian", "classifier", "algorithm"],
6+
"homepage": "http://www.diatelys.fr",
7+
"license": "BSD",
8+
"authors": [
9+
{
10+
"name": "Stéphane Bauland",
11+
"email": "[email protected]"
12+
}
13+
],
14+
"autoload": {
15+
"psr-4": {"Classify\\": "src/"}
16+
}
17+
}

src/Classify.php

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
<?php
2+
3+
namespace Classify;
4+
5+
/**
 * Naive Bayes text classifier; a PHP port of the Ruby Bayes classifier library.
 *
 * Categories are declared up front, trained with sample texts, and then used
 * to score or classify new texts.
 *
 * @see http://github.com/xaviershay/classifier
 */
class Classify
{
    /** @var array Category name => array(stemmed word => number of occurrences seen in training). */
    private $_categories;

    /** @var int Sum of all word counts added by train(), across every category. */
    private $_totalWords;

    /**
     * The class can be created with one or more categories, each of which
     * starts with an empty word table. E.g.,
     *      $b = new Classify('Interesting', 'Uninteresting', 'Spam')
     *
     * Category names are taken from the (variable-length) argument list.
     */
    public function __construct()
    {
        $this->_totalWords = 0;
        $this->_categories = array();

        foreach (func_get_args() as $category)
        {
            $this->_categories[$category] = array();
        }
    }

    /**
     * Adds the words of $text to the word table of $category.
     * For example:
     *     $b = new Classify('this', 'that', 'the_other')
     *     $b->train('this', 'This text')
     *     $b->train('that', 'That text')
     *     $b->train('the_other', 'The other text')
     *
     * @param string $category Category to train.
     * @param string $text     Sample text belonging to that category.
     */
    public function train($category, $text)
    {
        foreach ($this->_wordArray($text) as $word => $count)
        {
            if (!isset($this->_categories[$category][$word]))
            {
                $this->_categories[$category][$word] = 0;
            }

            $this->_categories[$category][$word] += $count;
            $this->_totalWords += $count;
        }
    }

    /**
     * Returns the score of the provided $text in each category. E.g.,
     *    $b->classifications("I hate bad words and you")
     *    =>  {"Uninteresting" => -12.6997928013932, "Interesting" => -18.4206807439524}
     * The largest of these scores (the one closest to 0) is the one picked out by classify()
     *
     * A score is the sum, over every distinct word of the text, of
     * log(count / total) where count is the word's training count in the
     * category (0.1 when the word was never seen there) and total is the
     * number of words the category was trained with.
     *
     * A category that has not been trained yet has no meaningful
     * probabilities; it scores -INF as soon as the text contains a word, so
     * that it can never outrank a trained category (and so that no division
     * by zero occurs).
     *
     * @param string $text Text to score.
     * @return array Category name => float score.
     */
    public function classifications($text)
    {
        // Tokenising and stemming does not depend on the category: do it once.
        $words = array_keys($this->_wordArray($text));
        $score = array();

        foreach ($this->_categories as $category => $categoryWords)
        {
            $score[$category] = 0.0;
            $total = array_sum($categoryWords);

            if ($total <= 0)
            {
                // Untrained category: avoid dividing by zero.
                if (!empty($words))
                {
                    $score[$category] = -INF;
                }
                continue;
            }

            foreach ($words as $word)
            {
                $s = isset($categoryWords[$word]) ? $categoryWords[$word] : 0.1;
                $score[$category] += log($s / $total);
            }
        }

        return $score;
    }

    /**
     * Returns the classification of the provided $text, which is one of the
     * categories given in the initializer. E.g.,
     *    $b->classify("I hate bad words and you")
     *    =>  'Uninteresting'
     *
     * @param string $text Text to classify.
     * @return string|int|null Best scoring category, or null when no category exists.
     */
    public function classify($text)
    {
        $scores = $this->classifications($text);

        if (empty($scores))
        {
            return null;
        }

        // Highest score first; key() then reads the winning category without
        // passing a temporary to a by-reference parameter.
        arsort($scores);
        reset($scores);

        return key($scores);
    }

    // ----

    /**
     * Return an array of strings => ints. Each word in the string is stemmed,
     * and indexes to its frequency in the document.
     *
     * The text is tokenised twice: once with punctuation removed (the words)
     * and once with word characters blanked out (the punctuation runs).
     * Empty tokens are dropped; they could never be counted anyway.
     *
     * @param string $text Raw text.
     * @return array Stemmed token => occurrence count.
     */
    private function _wordArray($text)
    {
        $wordTokens = preg_split(
            '/\s+/',
            preg_replace('/[^\w\s]/', '', $text),
            -1,
            PREG_SPLIT_NO_EMPTY
        );
        $punctuationTokens = preg_split(
            '/\s+/',
            preg_replace('/[\w]/', ' ', $text),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        return $this->_wordArrayForWords(array_merge($wordTokens, $punctuationTokens));
    }

    /**
     * Counts the given tokens, keyed by their stem.
     *
     * A token is counted when it contains a non-word character, or when it is
     * longer than two characters and is not listed in CORPUS_SKIP_WORDS.
     *
     * @param array $words List of tokens.
     * @return array Stemmed token => occurrence count.
     */
    private function _wordArrayForWords($words)
    {
        $d = array();

        foreach ($words as $word)
        {
            if (preg_match('/[\w]+/', $word))
            {
                $word = strtolower($word);
            }
            $key = Stemmer::Stem($word);

            $hasNonWordChar = (bool) preg_match('/[^\w]/', $word);
            $isSignificant  = !in_array($word, self::$CORPUS_SKIP_WORDS, true)
                && strlen($word) > 2;

            if ($hasNonWordChar || $isSignificant)
            {
                if (!isset($d[$key]))
                {
                    $d[$key] = 0;
                }

                $d[$key] += 1;
            }
        }

        return $d;
    }

    /** @var array Lower-case words that are ignored when counting. */
    private static $CORPUS_SKIP_WORDS = array(
        "a",
        "again",
        "all",
        "along",
        "are",
        "also",
        "an",
        "and",
        "as",
        "at",
        "but",
        "by",
        "came",
        "can",
        "cant",
        "couldnt",
        "did",
        "didn",
        "didnt",
        "do",
        "doesnt",
        "dont",
        "ever",
        "first",
        "from",
        "have",
        "her",
        "here",
        "him",
        "how",
        "i",
        "if",
        "in",
        "into",
        "is",
        "isnt",
        "it",
        "itll",
        "just",
        "last",
        "least",
        "like",
        "most",
        "my",
        "new",
        "no",
        "not",
        "now",
        "of",
        "on",
        "or",
        "should",
        "sinc",
        "so",
        "some",
        "th",
        "than",
        "this",
        "that",
        "the",
        "their",
        "then",
        "those",
        "to",
        "told",
        "too",
        "true",
        "try",
        "until",
        "url",
        "us",
        "were",
        "when",
        "whether",
        "while",
        "with",
        "within",
        "yes",
        "you",
        "youll",
    );
}
211+

0 commit comments

Comments
 (0)