diff --git a/config/IKAnalyzer.cfg.xml b/config/IKAnalyzer.cfg.xml
index fe69bb20..b8d5359f 100644
--- a/config/IKAnalyzer.cfg.xml
+++ b/config/IKAnalyzer.cfg.xml
@@ -3,11 +3,15 @@
IK Analyzer 扩展配置
-
-
-
+ custom/mydict.dic;custom/single_word_low_freq.dic;remote.dic
+
+ custom/ext_stopword.dic
+
+ false
diff --git a/config/remote.dic b/config/remote.dic
new file mode 100644
index 00000000..0049df16
--- /dev/null
+++ b/config/remote.dic
@@ -0,0 +1 @@
+//这是远程扩展字典的备份
\ No newline at end of file
diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
index daa5b62d..b4c3cd70 100644
--- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java
+++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -25,13 +25,7 @@
*/
package org.wltea.analyzer.dic;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.*;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.Executors;
@@ -88,15 +82,17 @@ public class Dictionary {
public static final String PATH_DIC_SUFFIX = "suffix.dic";
public static final String PATH_DIC_PREP = "preposition.dic";
public static final String PATH_DIC_STOP = "stopword.dic";
+ public static final String PATH_DIC_REMOTE = "remote.dic";
private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
private final static String EXT_DICT = "ext_dict";
private final static String REMOTE_EXT_DICT = "remote_ext_dict";
private final static String EXT_STOP = "ext_stopwords";
private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
+ private final static String REBASE_REMOTE_EXT_DICT = "rebase_remote_ext_dict";
- private Path conf_dir;
- private Properties props;
+ private static Path conf_dir;
+ private static Properties props;
private Dictionary(Configuration cfg) {
this.configuration = cfg;
@@ -130,7 +126,7 @@ private Dictionary(Configuration cfg) {
}
}
- public String getProperty(String key){
+ public static String getProperty(String key){
if(props!=null){
return props.getProperty(key);
}
@@ -139,7 +135,7 @@ public String getProperty(String key){
/**
* 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
* 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
- *
+ *
* @return Dictionary
*/
public static synchronized Dictionary initial(Configuration cfg) {
@@ -239,14 +235,14 @@ public List getRemoteExtStopWordDictionarys() {
return remoteExtStopWordDictFiles;
}
- public String getDictRoot() {
+ public static String getDictRoot() {
return conf_dir.toAbsolutePath().toString();
}
/**
* 获取词典单子实例
- *
+ *
* @return Dictionary 单例对象
*/
public static Dictionary getSingleton() {
@@ -259,7 +255,7 @@ public static Dictionary getSingleton() {
/**
* 批量加载新词条
- *
+ *
* @param words
* Collection词条列表
*/
@@ -290,7 +286,7 @@ public void disableWords(Collection words) {
/**
* 检索匹配主词典
- *
+ *
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray) {
@@ -299,7 +295,7 @@ public Hit matchInMainDict(char[] charArray) {
/**
* 检索匹配主词典
- *
+ *
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray, int begin, int length) {
@@ -308,7 +304,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) {
/**
* 检索匹配量词词典
- *
+ *
* @return Hit 匹配结果描述
*/
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
@@ -317,7 +313,7 @@ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
/**
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
- *
+ *
* @return Hit
*/
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
@@ -327,7 +323,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
/**
* 判断是否是停止词
- *
+ *
* @return boolean
*/
public boolean isStopWord(char[] charArray, int begin, int length) {
@@ -483,6 +479,10 @@ private static List getRemoteWords(String location) {
}
in.close();
response.close();
+ String isRebase = getProperty(REBASE_REMOTE_EXT_DICT);
+ if (isRebase.equals("true")) {
+ return writeRemoteWords(buffer);
+ }
return buffer;
}
response.close();
@@ -761,4 +761,40 @@ public void reLoadMainDict() {
logger.info("重新加载词典完毕...");
}
+ /**
+ * 把远程扩展字典的词备份下来,并放入用户自定义的扩展字典中
+ */
+ private static List writeRemoteWords(List buffer) throws IOException {
+ if (buffer == null || buffer.size() == 0) {
+ return new ArrayList<>();
+ }
+ Set bufferSet = new HashSet<>(buffer);
+ if (bufferSet.size() == 0) {
+ return new ArrayList<>();
+ }
+ Path path = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_REMOTE);
+ File file = path.toFile();
+ List words = new ArrayList<>();
+ FileReader fr = new FileReader(file);
+ BufferedReader br = new BufferedReader(fr);
+ String line;
+ while ((line = br.readLine()) != null) {
+ words.add(line);
+ }
+ for (String str : words) {
+ if (bufferSet.contains(str)) {
+ bufferSet.remove(str);
+ }
+ }
+ fr.close();
+ br.close();
+ //写入remote.dic中
+ FileWriter fw = new FileWriter(file, true);
+ for (String buf : bufferSet) {
+ fw.append("\n");
+ fw.append(buf);
+ }
+ fw.close();
+ return new ArrayList<>(bufferSet);
+ }
}