diff --git a/config/IKAnalyzer.cfg.xml b/config/IKAnalyzer.cfg.xml index fe69bb20..b8d5359f 100644 --- a/config/IKAnalyzer.cfg.xml +++ b/config/IKAnalyzer.cfg.xml @@ -3,11 +3,15 @@ IK Analyzer 扩展配置 - - - + custom/mydict.dic;custom/single_word_low_freq.dic;remote.dic + + custom/ext_stopword.dic + + false diff --git a/config/remote.dic b/config/remote.dic new file mode 100644 index 00000000..0049df16 --- /dev/null +++ b/config/remote.dic @@ -0,0 +1 @@ +//这是远程扩展字典的备份 \ No newline at end of file diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index daa5b62d..b4c3cd70 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -25,13 +25,7 @@ */ package org.wltea.analyzer.dic; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; import java.nio.file.Path; import java.util.*; import java.util.concurrent.Executors; @@ -88,15 +82,17 @@ public class Dictionary { public static final String PATH_DIC_SUFFIX = "suffix.dic"; public static final String PATH_DIC_PREP = "preposition.dic"; public static final String PATH_DIC_STOP = "stopword.dic"; + public static final String PATH_DIC_REMOTE = "remote.dic"; private final static String FILE_NAME = "IKAnalyzer.cfg.xml"; private final static String EXT_DICT = "ext_dict"; private final static String REMOTE_EXT_DICT = "remote_ext_dict"; private final static String EXT_STOP = "ext_stopwords"; private final static String REMOTE_EXT_STOP = "remote_ext_stopwords"; + private final static String REBASE_REMOTE_EXT_DICT = "rebase_remote_ext_dict"; - private Path conf_dir; - private Properties props; + private static Path conf_dir; + private static Properties props; private Dictionary(Configuration cfg) { this.configuration = 
cfg; @@ -130,7 +126,7 @@ private Dictionary(Configuration cfg) { } } - public String getProperty(String key){ + public static String getProperty(String key){ if(props!=null){ return props.getProperty(key); } @@ -139,7 +135,7 @@ public String getProperty(String key){ /** * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段 - * + * * @return Dictionary */ public static synchronized Dictionary initial(Configuration cfg) { @@ -239,14 +235,14 @@ public List getRemoteExtStopWordDictionarys() { return remoteExtStopWordDictFiles; } - public String getDictRoot() { + public static String getDictRoot() { return conf_dir.toAbsolutePath().toString(); } /** * 获取词典单子实例 - * + * * @return Dictionary 单例对象 */ public static Dictionary getSingleton() { @@ -259,7 +255,7 @@ public static Dictionary getSingleton() { /** * 批量加载新词条 - * + * * @param words * Collection词条列表 */ @@ -290,7 +286,7 @@ public void disableWords(Collection words) { /** * 检索匹配主词典 - * + * * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray) { @@ -299,7 +295,7 @@ public Hit matchInMainDict(char[] charArray) { /** * 检索匹配主词典 - * + * * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray, int begin, int length) { @@ -308,7 +304,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) { /** * 检索匹配量词词典 - * + * * @return Hit 匹配结果描述 */ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { @@ -317,7 +313,7 @@ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { /** * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 - * + * * @return Hit */ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { @@ -327,7 +323,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { /** * 判断是否是停止词 - * + * * @return boolean */ public boolean isStopWord(char[] charArray, int begin, int length) { @@ -483,6 +479,10 @@ private static List getRemoteWords(String 
location) {
                     }
                     in.close();
                     response.close();
+                    String isRebase = getProperty(REBASE_REMOTE_EXT_DICT);
+                    if ("true".equals(isRebase)) { // constant-first: getProperty yields null when the key is not configured
+                        return writeRemoteWords(buffer);
+                    }
                     return buffer;
                 }
                 response.close();
@@ -761,4 +761,54 @@ public void reLoadMainDict() {
         logger.info("重新加载词典完毕...");
     }
 
+    /**
+     * Backs up the words fetched from a remote extension dictionary into the
+     * local remote.dic file (PATH_DIC_REMOTE) under the dictionary root.
+     *
+     * Words that already appear as a line of the backup file are skipped; the
+     * rest are appended, one per line, in the order they occur in buffer.
+     *
+     * @param buffer words read from the remote dictionary; may be null or empty
+     * @return the words newly appended to the backup file; an empty list when
+     *         there is nothing new to append
+     * @throws IOException if the backup file cannot be read or written
+     */
+    private static List<String> writeRemoteWords(List<String> buffer) throws IOException {
+        if (buffer == null || buffer.isEmpty()) {
+            return new ArrayList<>();
+        }
+        // De-duplicate but keep the remote order, so the backup is written deterministically.
+        Set<String> pending = new LinkedHashSet<>(buffer);
+        // A null entry carries no word; dropping it keeps the literal "null" out of the file.
+        pending.remove(null);
+        File file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_REMOTE).toFile();
+        // NOTE(review): UTF-8 is assumed for dictionary files; confirm against the loader.
+        java.nio.charset.Charset charset = java.nio.charset.StandardCharsets.UTF_8;
+        if (file.exists()) {
+            // try-with-resources closes the reader even when readLine() throws.
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(new FileInputStream(file), charset))) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    pending.remove(line);
+                }
+            }
+        }
+        if (pending.isEmpty()) {
+            return new ArrayList<>();
+        }
+        // Start with a line break only when the file has content: its last line may be unterminated.
+        boolean needSeparator = file.length() > 0;
+        try (BufferedWriter writer = new BufferedWriter(
+                new OutputStreamWriter(new FileOutputStream(file, true), charset))) {
+            for (String word : pending) {
+                if (needSeparator) {
+                    writer.write('\n');
+                }
+                writer.write(word);
+                needSeparator = true;
+            }
+        }
+        return new ArrayList<>(pending);
+    }
 }