Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 1.9.5版本,用户热更新时,用户可配置是否把从远程字典获取的词存入到自己的字典中 #430

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions config/IKAnalyzer.cfg.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict"></entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords"></entry>
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic;remote.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
<!--用户可以在这里配置远程扩展字典 -->
<!-- <entry key="remote_ext_dict">words_location</entry> -->
<!--用户可以在这里配置远程扩展停止词字典-->
<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
<!--用户可以在这里配置是否把远程扩展字典的内容备份到自己的扩展字典中,默认为false,如果要启用则设置为true
注意:如果设置为true,需要修改jre的java.policy文件,给remote.dic加上write权限,具体操作参照该网址
http://www.cnblogs.com/yiwangzhibujian/p/6207212.html-->
<entry key="rebase_remote_ext_dict">false</entry>
</properties>
1 change: 1 addition & 0 deletions config/remote.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
//这是远程扩展字典的备份
74 changes: 55 additions & 19 deletions src/main/java/org/wltea/analyzer/dic/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,7 @@
*/
package org.wltea.analyzer.dic;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.*;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.Executors;
Expand Down Expand Up @@ -88,15 +82,17 @@ public class Dictionary {
public static final String PATH_DIC_SUFFIX = "suffix.dic";
public static final String PATH_DIC_PREP = "preposition.dic";
public static final String PATH_DIC_STOP = "stopword.dic";
public static final String PATH_DIC_REMOTE = "remote.dic";

private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
private final static String EXT_DICT = "ext_dict";
private final static String REMOTE_EXT_DICT = "remote_ext_dict";
private final static String EXT_STOP = "ext_stopwords";
private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
private final static String REBASE_REMOTE_EXT_DICT = "rebase_remote_ext_dict";

private Path conf_dir;
private Properties props;
private static Path conf_dir;
private static Properties props;

private Dictionary(Configuration cfg) {
this.configuration = cfg;
Expand Down Expand Up @@ -130,7 +126,7 @@ private Dictionary(Configuration cfg) {
}
}

public String getProperty(String key){
public static String getProperty(String key){
if(props!=null){
return props.getProperty(key);
}
Expand All @@ -139,7 +135,7 @@ public String getProperty(String key){
/**
* 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
* 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
*
*
* @return Dictionary
*/
public static synchronized Dictionary initial(Configuration cfg) {
Expand Down Expand Up @@ -239,14 +235,14 @@ public List<String> getRemoteExtStopWordDictionarys() {
return remoteExtStopWordDictFiles;
}

public String getDictRoot() {
public static String getDictRoot() {
return conf_dir.toAbsolutePath().toString();
}


/**
* 获取词典单子实例
*
*
* @return Dictionary 单例对象
*/
public static Dictionary getSingleton() {
Expand All @@ -259,7 +255,7 @@ public static Dictionary getSingleton() {

/**
* 批量加载新词条
*
*
* @param words
* Collection<String>词条列表
*/
Expand Down Expand Up @@ -290,7 +286,7 @@ public void disableWords(Collection<String> words) {

/**
* 检索匹配主词典
*
*
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray) {
Expand All @@ -299,7 +295,7 @@ public Hit matchInMainDict(char[] charArray) {

/**
* 检索匹配主词典
*
*
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray, int begin, int length) {
Expand All @@ -308,7 +304,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) {

/**
* 检索匹配量词词典
*
*
* @return Hit 匹配结果描述
*/
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
Expand All @@ -317,7 +313,7 @@ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {

/**
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
*
*
* @return Hit
*/
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
Expand All @@ -327,7 +323,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {

/**
* 判断是否是停止词
*
*
* @return boolean
*/
public boolean isStopWord(char[] charArray, int begin, int length) {
Expand Down Expand Up @@ -483,6 +479,10 @@ private static List<String> getRemoteWords(String location) {
}
in.close();
response.close();
String isRebase = getProperty(REBASE_REMOTE_EXT_DICT);
if (isRebase.equals("true")) {
return writeRemoteWords(buffer);
}
return buffer;
}
response.close();
Expand Down Expand Up @@ -761,4 +761,40 @@ public void reLoadMainDict() {
logger.info("重新加载词典完毕...");
}

/**
 * Backs up words fetched from a remote extension dictionary into the local
 * backup file ({@link #PATH_DIC_REMOTE}) located under the dictionary root.
 *
 * <p>Every line already present in the backup file is treated as an existing
 * word and removed from the candidate set; the remaining words are appended to
 * the file, each preceded by a newline.
 *
 * @param buffer words fetched from the remote dictionary; may be {@code null} or empty
 * @return the words from {@code buffer} that were not yet in the backup file and
 *         have now been appended to it, in first-seen order without duplicates;
 *         an empty list when there is nothing new
 * @throws IOException if the backup file does not exist or cannot be read or written
 */
private static List<String> writeRemoteWords(List<String> buffer) throws IOException {
    if (buffer == null || buffer.isEmpty()) {
        return new ArrayList<>();
    }
    // LinkedHashSet de-duplicates while keeping the remote order, so both the
    // appended lines and the returned list are deterministic.
    Set<String> pending = new LinkedHashSet<>(buffer);
    File file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_REMOTE).toFile();

    // Drop every word that is already backed up. The charset is explicit so the
    // result does not depend on the platform default; try-with-resources closes
    // the reader even if reading fails part-way.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            pending.remove(line);
        }
    }

    if (pending.isEmpty()) {
        // Nothing new: avoid opening the file for writing at all.
        return new ArrayList<>();
    }

    // Append the new words to remote.dic (append mode, same "\n" + word layout as before).
    try (Writer writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(file, true), "UTF-8"))) {
        for (String word : pending) {
            writer.write("\n");
            writer.write(word);
        }
    }
    return new ArrayList<>(pending);
}
}