Skip to content

Commit b8d35fc

Browse files
author
wanghui03
committed
新增以文件方式构建敏感词树
1 parent 69323da commit b8d35fc

File tree

4 files changed

+1214
-44
lines changed

4 files changed

+1214
-44
lines changed

README.md

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ php实现基于确定有穷自动机算法的铭感词过滤
2424

2525
use DfaFilter\SensitiveHelper;
2626

27-
### 获取敏感词库
27+
### 构建敏感词库树
28+
场景一: 可以拿到不同(用户)词库数组
2829

2930
// 获取敏感词库索引数组
3031
$wordData = array(
@@ -35,21 +36,31 @@ php实现基于确定有穷自动机算法的铭感词过滤
3536
'成人卡通',
3637
......
3738
);
39+
40+
// get one helper
41+
$handle = SensitiveHelper::init()->setTree($wordData);
42+
43+
场景二: 全站使用一套敏感词库
44+
45+
// 获取敏感词库文件路径
46+
$wordFilePath = 'tests/data/words.txt';
47+
// get one helper
48+
$handle = SensitiveHelper::init()->setTreeByFile($wordFilePath);
3849

3950
### 检测是否含有敏感词
4051

41-
$islegal = SensitiveHelper::init()->setTree($wordData)->islegal($content);
52+
$islegal = $handle->islegal($content);
4253
### 敏感词过滤
4354

4455
// 敏感词替换为***为例
45-
$filterContent = SensitiveHelper::init()->setTree($wordData)->replace($content, '***');
56+
$filterContent = $handle->replace($content, '***');
4657

4758
### 获取文字中的敏感词
4859

4960
// 获取内容中所有的敏感词
50-
$sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content);
61+
$sensitiveWordGroup = $handle->getBadWord($content);
5162
// 仅获取一个敏感词
52-
$sensitiveWordGroup = SensitiveHelper::init()->setTree($wordData)->getBadWord($content, 1);
63+
$sensitiveWordGroup = $handle->getBadWord($content, 1);
5364

5465
### 如何使用单元测试进行测试
5566
#### 安装PHPUnit

src/DfaFilter/SensitiveHelper.php

Lines changed: 91 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -51,46 +51,45 @@ public static function init()
5151
return self::$_instance;
5252
}
5353

54+
/**
 * Build the sensitive-word tree from a word-list file (file mode).
 *
 * The file is consumed line by line through yieldToReadFile(); every line is
 * trimmed and handed to buildWordToTree() as one word. Lines that are empty
 * after trimming are ignored by buildWordToTree().
 *
 * @param string $filepath Path to the word-list file, one word per line.
 * @return $this
 * @throws \Exception When $filepath is not an existing, readable regular file.
 */
public function setTreeByFile($filepath = '')
{
    // file_exists() alone would also accept a directory, which cannot be
    // read as a word list, so require a readable regular file instead.
    if (! is_file($filepath) || ! is_readable($filepath)) {
        throw new \Exception('词库文件不存在');
    }

    // Replace any previously built tree with a fresh, empty one.
    $this->wordTree = new HashMap();

    foreach ($this->yieldToReadFile($filepath) as $word) {
        // fgets() reports a failed read as false; skip it instead of
        // trimming a boolean.
        if (! is_string($word)) {
            continue;
        }

        $this->buildWordToTree(trim($word));
    }

    return $this;
}
75+
5476

5577
/**
56-
* 构建铭感词树
78+
* Build the sensitive-word tree from an array of words (array mode)
5779
*
5880
* @param array $sensitiveWords
5981
* @return $this
6082
*/
61-
public function setTree($sensitiveWords = '')
83+
public function setTree($sensitiveWords = null)
6284
{
6385
if (empty($sensitiveWords)) {
6486
throw new \Exception('词库不能为空');
6587
}
66-
$this->wordTree = new HashMap();
67-
foreach ($sensitiveWords as $word) {
68-
$tree = $this->wordTree;
69-
$wordLength = mb_strlen($word, 'utf-8');
70-
for ($i = 0; $i < $wordLength; $i++) {
71-
$keyChar = mb_substr($word, $i, 1, 'utf-8');
72-
73-
// 获取子节点树结构
74-
$tempTree = $tree->get($keyChar);
75-
76-
if ($tempTree) {
77-
$tree = $tempTree;
78-
} else {
79-
// 设置标志位
80-
$newTree = new HashMap();
81-
$newTree->put('ending', false);
82-
83-
// 添加到集合
84-
$tree->put($keyChar, $newTree);
85-
$tree = $newTree;
86-
}
8788

88-
// 到达最后一个节点
89-
if ($i == $wordLength - 1) {
90-
$tree->put('ending', true);
91-
}
89+
$this->wordTree = new HashMap();
9290

93-
}
91+
foreach ($sensitiveWords as $word) {
92+
$this->buildWordToTree($word);
9493
}
9594
return $this;
9695
}
@@ -117,21 +116,27 @@ public function getBadWord($content, $matchType = 1)
117116
$nowMap = $tempMap->get($keyChar);
118117

119118
// 不存在节点树,直接返回
120-
if (empty($nowMap)) break;
119+
if (empty($nowMap)) {
120+
break;
121+
}
121122

122123
// 存在,则判断是否为最后一个
123124
$tempMap = $nowMap;
124125

125126
// 找到相应key,偏移量+1
126-
$matchFlag ++;
127+
$matchFlag++;
127128

128129
// 如果为最后一个匹配规则,结束循环,返回匹配标识数
129-
if (false === $nowMap->get('ending')) continue;
130+
if (false === $nowMap->get('ending')) {
131+
continue;
132+
}
130133

131134
$flag = true;
132135

133136
// 最小规则,直接退出
134-
if (1 === $matchType) break;
137+
if (1 === $matchType) {
138+
break;
139+
}
135140
}
136141

137142
if (! $flag) {
@@ -204,16 +209,18 @@ public function islegal($content)
204209
$nowMap = $tempMap->get($keyChar);
205210

206211
// 不存在节点树,直接返回
207-
if (empty($nowMap)) break;
208-
209-
// 存在,则判断是否为最后一个
210-
$tempMap = $nowMap;
212+
if (empty($nowMap)) {
213+
break;
214+
}
211215

212216
// 找到相应key,偏移量+1
213-
$matchFlag ++;
217+
$tempMap = $nowMap;
218+
$matchFlag++;
214219

215220
// 如果为最后一个匹配规则,结束循环,返回匹配标识数
216-
if (false === $nowMap->get('ending')) continue;
221+
if (false === $nowMap->get('ending')) {
222+
continue;
223+
}
217224

218225
return true;
219226
}
@@ -228,4 +235,49 @@ public function islegal($content)
228235
}
229236
return false;
230237
}
231-
}
238+
239+
/**
 * Lazily read a file, yielding one line per iteration.
 *
 * Lines are yielded exactly as fgets() returns them (a trailing newline is
 * not stripped), so callers are expected to trim them.
 *
 * @param string $filepath Path of the file to read.
 * @return \Generator
 * @throws \Exception When the file cannot be opened (raised on first iteration).
 */
protected function yieldToReadFile($filepath)
{
    $fp = fopen($filepath, 'r');
    if (false === $fp) {
        // Looping on feof(false) would never reach end-of-file, so fail
        // loudly instead of iterating over an invalid handle.
        throw new \Exception('词库文件读取失败');
    }

    try {
        // Stop on fgets() === false rather than on feof(): this never
        // yields a bogus false value after the final line.
        while (false !== ($line = fgets($fp))) {
            yield $line;
        }
    } finally {
        // Also runs when the generator is destroyed before being fully
        // consumed, so the handle is always released.
        fclose($fp);
    }
}
247+
248+
/**
 * Insert a single sensitive word into the word tree.
 *
 * Walks the tree one UTF-8 character at a time starting from
 * $this->wordTree, reusing the child node stored under a character when
 * one exists and creating it (with 'ending' => false) otherwise. The node
 * reached by the last character is flagged with 'ending' => true.
 *
 * @param string $word Word to insert; an empty string is ignored.
 * @return void
 */
protected function buildWordToTree($word = '')
{
    if ('' === $word) {
        return;
    }

    $node = $this->wordTree;
    $lastIndex = mb_strlen($word, 'utf-8') - 1;

    for ($pos = 0; $pos <= $lastIndex; $pos++) {
        $char = mb_substr($word, $pos, 1, 'utf-8');

        // Look up the child node for this character.
        $child = $node->get($char);

        if (! $child) {
            // No branch for this character yet: create one that is not
            // (yet) the end of any word and attach it to the current node.
            $child = new HashMap();
            $child->put('ending', false);
            $node->put($char, $child);
        }

        // Descend into the child for the next character.
        $node = $child;

        // The node of the final character marks a complete word.
        if ($pos == $lastIndex) {
            $node->put('ending', true);
        }
    }
}
283+
}

tests/ProTest.php

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?php
2+
/**
3+
* Created by PhpStorm.
4+
* User: zed
5+
* Date: 17-11-13
6+
* Time: 上午10:00
7+
*/
8+
9+
use DfaFilter\SensitiveHelper;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Tests for building the sensitive-word tree from a word-list file.
 *
 * NOTE(review): this file is tests/ProTest.php but the class is named
 * BaseTest; if another test file declares the same class name, loading both
 * would clash. Confirm and rename if so.
 */
class BaseTest extends TestCase
{
    /** @var string Path to the sensitive-word list file used by the tests. */
    protected $wordPoolPath;

    public function setUp()
    {
        parent::setUp();

        // Resolve the word list relative to this test file rather than the
        // current working directory, so the tests pass no matter where
        // PHPUnit is launched from.
        $this->wordPoolPath = __DIR__ . '/data/words.txt';
    }

    /**
     * getBadWord() should report the sensitive word contained in the text.
     */
    public function testGetBadWord()
    {
        $content = '这是一段测试语句,请忽略赌球网';

        // Detection only: the word "赌球网" is expected to be in the word list.
        $badWords = SensitiveHelper::init()
            ->setTreeByFile($this->wordPoolPath)
            ->getBadWord($content);

        $this->assertEquals('赌球网', $badWords[0]);
    }

    /**
     * replace() should substitute the sensitive word with the given string.
     */
    public function testFilterWord()
    {
        $content = '这是一段测试语句,请忽略赌球网';

        // Filtering: the word "赌球网" is expected to be in the word list.
        $filterContent = SensitiveHelper::init()
            ->setTreeByFile($this->wordPoolPath)
            ->replace($content, '*');

        $this->assertEquals('这是一段测试语句,请忽略*', $filterContent);
    }
}

0 commit comments

Comments
 (0)