
Commit 6d7c24a

fix: provide new methods to handle segmentation when custom user-dictionary words contain spaces (huaban#137)
1 parent e46e44b commit 6d7c24a

File tree

5 files changed: +212 −0 lines changed

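In short: user-dictionary files may now use a custom word/frequency separator (here ","), so dictionary entries can contain spaces, and processExtend() segments text without treating a space as a hard break. A minimal end-to-end sketch, mirroring the new test at the bottom of this diff (the "conf" directory and "," separator come from that test; the demo class itself is illustrative):

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;
import com.huaban.analysis.jieba.WordDictionary;
import java.nio.file.Paths;
import java.util.List;

public class SpaceDictDemo {
    public static void main(String[] args) {
        // Load every *.dict under conf/, splitting each line on ","
        // so entries such as "IPSEC VPN,3" keep their embedded space.
        WordDictionary.getInstance().init(Paths.get("conf"), ",");

        JiebaSegmenter segmenter = new JiebaSegmenter();
        // processExtend() buffers across spaces, so the dictionary
        // phrase "IPSEC VPN" can come back as a single token.
        List<SegToken> tokens = segmenter.processExtend("订购IPSEC VPN", SegMode.SEARCH);
        System.out.println(tokens);
    }
}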

conf/userextend.dict

+2
@@ -0,0 +1,2 @@
+弹性公网IP,3
+IPSEC VPN,3
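The entries are "word,frequency" pairs separated by a comma rather than whitespace: the stock loader splits each line on blanks and tabs, which would cut "IPSEC VPN" in two. A small self-contained illustration of why the separator matters (class and variable names are illustrative):

public class SeparatorDemo {
    public static void main(String[] args) {
        String line = "IPSEC VPN,3";
        // A whitespace split (the original loader's behavior) breaks the word:
        System.out.println(java.util.Arrays.toString(line.split("\\s+"))); // [IPSEC, VPN,3]
        // The comma split added by this commit keeps it intact:
        System.out.println(java.util.Arrays.toString(line.split(",")));    // [IPSEC VPN, 3]
    }
}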

src/main/java/com/huaban/analysis/jieba/CharacterUtil.java

+18
@@ -6,6 +6,7 @@
 public class CharacterUtil {
     public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
     private static final char[] connectors = new char[] { '+', '#', '&', '.', '_', '-' };
+    private static final char[] connectorsExtend = new char[] { '+', '#', '&', '.', '_', '-', ' ' };


     public static boolean isChineseLetter(char ch) {
@@ -36,6 +37,12 @@ public static boolean isConnector(char ch) {
         return false;
     }

+    public static boolean isConnectorExtend(char ch) {
+        for (char connector : connectorsExtend)
+            if (ch == connector)
+                return true;
+        return false;
+    }

     public static boolean ccFind(char ch) {
         if (isChineseLetter(ch))
@@ -49,6 +56,17 @@ public static boolean ccFind(char ch) {
         return false;
     }

+    public static boolean ccFindExtend(char ch) {
+        if (isChineseLetter(ch))
+            return true;
+        if (isEnglishLetter(ch))
+            return true;
+        if (isDigit(ch))
+            return true;
+        if (isConnectorExtend(ch))
+            return true;
+        return false;
+    }

     /**
      * full-width to half-width, upper case to lower case
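The extended predicates differ from the originals only by accepting the space character, so a run like "IPSEC VPN" stays in one buffer instead of being flushed at the blank. A quick sanity check (a hypothetical demo class, assuming CharacterUtil is on the classpath):

import com.huaban.analysis.jieba.CharacterUtil;

public class ConnectorCheck {
    public static void main(String[] args) {
        System.out.println(CharacterUtil.isConnector(' '));        // false -- original predicate, unchanged
        System.out.println(CharacterUtil.isConnectorExtend(' '));  // true  -- space joins the connector set
        System.out.println(CharacterUtil.ccFindExtend(' '));       // true  -- so the segmenter keeps buffering
    }
}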

src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java

+80
@@ -175,6 +175,86 @@ public List<SegToken> process(String paragraph, SegMode mode) {
         return tokens;
     }

+    public List<SegToken> processExtend(String paragraph, SegMode mode) {
+        List<SegToken> tokens = new ArrayList<SegToken>();
+        StringBuilder sb = new StringBuilder();
+        int offset = 0;
+        for (int i = 0; i < paragraph.length(); ++i) {
+            char ch = CharacterUtil.regularize(paragraph.charAt(i));
+            if (CharacterUtil.ccFindExtend(ch))
+                sb.append(ch);
+            else {
+                if (sb.length() > 0) {
+                    // process
+                    if (mode == SegMode.SEARCH) {
+                        for (String word : sentenceProcess(sb.toString())) {
+                            tokens.add(new SegToken(word, offset, offset += word.length()));
+                        }
+                    }
+                    else {
+                        for (String token : sentenceProcess(sb.toString())) {
+                            if (token.length() > 2) {
+                                String gram2;
+                                int j = 0;
+                                for (; j < token.length() - 1; ++j) {
+                                    gram2 = token.substring(j, j + 2);
+                                    if (wordDict.containsWord(gram2))
+                                        tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
+                                }
+                            }
+                            if (token.length() > 3) {
+                                String gram3;
+                                int j = 0;
+                                for (; j < token.length() - 2; ++j) {
+                                    gram3 = token.substring(j, j + 3);
+                                    if (wordDict.containsWord(gram3))
+                                        tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
+                                }
+                            }
+                            tokens.add(new SegToken(token, offset, offset += token.length()));
+                        }
+                    }
+                    sb = new StringBuilder();
+                    offset = i;
+                }
+                if (wordDict.containsWord(paragraph.substring(i, i + 1)))
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
+                else
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
+            }
+        }
+        if (sb.length() > 0)
+            if (mode == SegMode.SEARCH) {
+                for (String token : sentenceProcess(sb.toString())) {
+                    tokens.add(new SegToken(token, offset, offset += token.length()));
+                }
+            }
+            else {
+                for (String token : sentenceProcess(sb.toString())) {
+                    if (token.length() > 2) {
+                        String gram2;
+                        int j = 0;
+                        for (; j < token.length() - 1; ++j) {
+                            gram2 = token.substring(j, j + 2);
+                            if (wordDict.containsWord(gram2))
+                                tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
+                        }
+                    }
+                    if (token.length() > 3) {
+                        String gram3;
+                        int j = 0;
+                        for (; j < token.length() - 2; ++j) {
+                            gram3 = token.substring(j, j + 3);
+                            if (wordDict.containsWord(gram3))
+                                tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
+                        }
+                    }
+                    tokens.add(new SegToken(token, offset, offset += token.length()));
+                }
+            }
+
+        return tokens;
+    }

     /*
      *
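processExtend() is a near-verbatim copy of process() with CharacterUtil.ccFind swapped for ccFindExtend, so space-joined runs reach sentenceProcess() intact; note the trailing if/else that emits single non-buffered characters has identical branches, carried over from process() unchanged. In non-SEARCH (i.e. INDEX) mode, each token additionally yields any dictionary 2-grams and 3-grams found inside it. A sketch of just that window scan (the token value is illustrative; in the real method, wordDict lookups decide which windows become SegTokens):

public class NgramWindowsDemo {
    public static void main(String[] args) {
        // For a 6-character token the scan probes 5 two-character windows
        // and 4 three-character windows; processExtend emits only those
        // windows that are present in the dictionary.
        String token = "弹性公网IP";
        for (int j = 0; j < token.length() - 1; ++j)
            System.out.println("2-gram window: " + token.substring(j, j + 2));
        for (int j = 0; j < token.length() - 2; ++j)
            System.out.println("3-gram window: " + token.substring(j, j + 3));
    }
}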

src/main/java/com/huaban/analysis/jieba/WordDictionary.java

+57
@@ -88,6 +88,27 @@ public void init(String[] paths) {
             }
         }
     }
+
+    public void init(Path configFile, String splitChar) {
+        String abspath = configFile.toAbsolutePath().toString();
+        Log.debug("initialize user dictionary:" + abspath);
+        synchronized (WordDictionary.class) {
+            if (loadedPath.contains(abspath))
+                return;
+
+            DirectoryStream<Path> stream;
+            try {
+                stream = Files.newDirectoryStream(configFile, String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX));
+                for (Path path : stream) {
+                    Log.error(String.format(Locale.getDefault(), "loading dict %s", path.toString()));
+                    singleton.loadUserDict(path, splitChar);
+                }
+                loadedPath.add(abspath);
+            } catch (IOException e) {
+                Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
+            }
+        }
+    }

     /**
      * let user just use their own dict instead of the default dict
@@ -156,6 +177,10 @@ public void loadUserDict(Path userDict) {
         loadUserDict(userDict, StandardCharsets.UTF_8);
     }

+    public void loadUserDict(Path userDict, String splitChar) {
+        loadUserDict(userDict, StandardCharsets.UTF_8, splitChar);
+    }
+
     public void loadUserDict(String userDictPath) {
         loadUserDict(userDictPath, StandardCharsets.UTF_8);
     }
@@ -223,6 +248,38 @@ public void loadUserDict(String userDictPath, Charset charset) {
             Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDictPath));
         }
     }
+
+    public void loadUserDict(Path userDict, Charset charset, String splitChar) {
+        try {
+            BufferedReader br = Files.newBufferedReader(userDict, charset);
+            long s = System.currentTimeMillis();
+            int count = 0;
+            while (br.ready()) {
+                String line = br.readLine();
+                String[] tokens = line.split(splitChar);
+
+                if (tokens.length < 1) {
+                    // Ignore empty line
+                    continue;
+                }
+
+                String word = tokens[0];
+
+                double freq = 3.0d;
+                if (tokens.length == 2)
+                    freq = Double.valueOf(tokens[1]);
+                word = addWord(word);
+                freqs.put(word, Math.log(freq / total));
+                count++;
+            }
+            // System.out.println(freqs);
+            Log.debug(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));
+            br.close();
+        }
+        catch (IOException e) {
+            Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));
+        }
+    }

     public DictSegment getTrie() {
         return this._dict;
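The new overloads change only how a dictionary line is tokenized: line.split(splitChar) replaces the whitespace split of the original loader, and since String.split treats its argument as a regular expression, the separator is interpreted as one (a plain "," is safe). Defaults are unchanged: a missing frequency falls back to 3.0, and every word is stored with log-normalized frequency Math.log(freq / total). A hedged usage sketch (the demo class is illustrative; the file path is the one added in this commit):

import com.huaban.analysis.jieba.WordDictionary;
import java.nio.file.Paths;

public class CommaDictDemo {
    public static void main(String[] args) {
        WordDictionary dict = WordDictionary.getInstance();
        // Split on "," so "IPSEC VPN,3" parses as word "IPSEC VPN", freq 3.
        dict.loadUserDict(Paths.get("conf/userextend.dict"), ",");
        // "IPSEC VPN" is now a known word in the dictionary trie.
    }
}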
src/test/java/com/huaban/analysis/jieba/JiebaSegmenterExtendTest.java

+55
@@ -0,0 +1,55 @@
+/**
+ *
+ */
+package com.huaban.analysis.jieba;
+
+import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
+import junit.framework.TestCase;
+import org.junit.Test;
+
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Locale;
+
+
+/**
+ * @author matrix
+ *
+ */
+public class JiebaSegmenterExtendTest extends TestCase {
+    private JiebaSegmenter segmenter = new JiebaSegmenter();
+    String[] sentences =
+        new String[] {
+            "订购弹性公网IP",
+            "订购IPSEC VPN"
+        };
+
+    @Override
+    protected void setUp() throws Exception {
+        WordDictionary.getInstance().init(Paths.get("conf"), ",");
+    }
+
+
+    @Override
+    protected void tearDown() throws Exception {
+        super.tearDown();
+    }
+
+
+    @Test
+    public void testCutForSearch() {
+        for (String sentence : sentences) {
+            List<SegToken> tokens = segmenter.processExtend(sentence, SegMode.SEARCH);
+            System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
+        }
+    }
+
+
+    @Test
+    public void testCutForIndex() {
+        for (String sentence : sentences) {
+            List<SegToken> tokens = segmenter.processExtend(sentence, SegMode.INDEX);
+            System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
+        }
+    }
+}
