From 46dfa57e5fe391e597f0e33c288c6c4eee260e41 Mon Sep 17 00:00:00 2001 From: liuchao31 Date: Sat, 4 May 2019 02:40:05 +0800 Subject: [PATCH] independent remote dictionary support. --- .../index/analysis/IkAnalyzerProvider.java | 2 +- .../index/analysis/IkTokenizerFactory.java | 2 +- .../org/wltea/analyzer/cfg/Configuration.java | 19 ++- .../wltea/analyzer/core/AnalyzeContext.java | 2 +- .../org/wltea/analyzer/core/CJKSegmenter.java | 10 +- .../analyzer/core/CN_QuantifierSegmenter.java | 10 +- .../org/wltea/analyzer/core/IKSegmenter.java | 4 +- .../org/wltea/analyzer/dic/Dictionary.java | 141 ++++++++---------- .../java/org/wltea/analyzer/dic/Monitor.java | 7 +- 9 files changed, 99 insertions(+), 98 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java index 552f9561..248a1516 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java @@ -12,7 +12,7 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider tmpHits; + private Configuration configuration; - - CJKSegmenter(){ + CJKSegmenter(Configuration configuration){ this.tmpHits = new LinkedList(); + this.configuration = configuration; } /* (non-Javadoc) @@ -58,7 +60,7 @@ public void analyze(AnalyzeContext context) { //处理词段队列 Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = configuration.getDictionary().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); if(hit.isMatch()){ //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); @@ -77,7 +79,7 @@ public void analyze(AnalyzeContext context) { //********************************* //再对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); + Hit singleCharHit = configuration.getDictionary().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); if(singleCharHit.isMatch()){//首字成词 //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); diff --git a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java index e147c2f0..b8f17eed 100644 --- a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Set; +import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.dic.Hit; @@ -65,12 +66,15 @@ class CN_QuantifierSegmenter implements ISegmenter{ //待处理的量词hit队列 private List countHits; + + private Configuration configuration; - CN_QuantifierSegmenter(){ + CN_QuantifierSegmenter(Configuration configuration){ nStart = -1; nEnd = -1; this.countHits = new LinkedList(); + this.configuration = configuration; } /** @@ -153,7 +157,7 @@ private void processCount(AnalyzeContext context){ //处理词段队列 Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = 
configuration.getDictionary().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
 				if(hit.isMatch()){
 					//输出当前的词
 					Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
@@ -172,7 +176,7 @@ private void processCount(AnalyzeContext context){
 			//*********************************
 			//对当前指针位置的字符进行单字匹配
-			Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+			Hit singleCharHit = configuration.getDictionary().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
 			if(singleCharHit.isMatch()){//首字成量词词
 				//输出当前的词
 				Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
index 789a3a67..c96460b3 100644
--- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
@@ -79,9 +79,9 @@ private List<ISegmenter> loadSegmenters(){
 		//处理字母的子分词器
 		segmenters.add(new LetterSegmenter());
 		//处理中文数量词的子分词器
-		segmenters.add(new CN_QuantifierSegmenter());
+		segmenters.add(new CN_QuantifierSegmenter(configuration));
 		//处理中文词的子分词器
-		segmenters.add(new CJKSegmenter());
+		segmenters.add(new CJKSegmenter(configuration));
 		return segmenters;
 	}
 
diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
index b61e0381..12719efb 100644
--- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java
+++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -25,39 +25,36 @@
  */
 package org.wltea.analyzer.dic;
 
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.file.attribute.BasicFileAttributes;
-import java.nio.file.Files;
-import java.nio.file.FileVisitResult;
-import java.nio.file.Path;
-import java.nio.file.SimpleFileVisitor;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.*;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-
 import org.apache.http.Header;
 import org.apache.http.HttpEntity;
-import org.apache.http.client.ClientProtocolException;
 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
+import org.apache.logging.log4j.Logger;
 import org.elasticsearch.SpecialPermission;
 import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
 import org.wltea.analyzer.cfg.Configuration;
-import org.apache.logging.log4j.Logger;
 import org.wltea.analyzer.help.ESPluginLoggerFactory;
+
+import java.io.*;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Properties;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
 
 /**
  * 词典管理类,单子模式
@@ -67,8 +64,6 @@ public class Dictionary {
 	/*
 	 * 词典单子实例
 	 */
-	private static Dictionary singleton;
-
 	private DictSegment _MainDict;
 
 	private DictSegment _QuantifierDict;
@@ -100,7 +95,7 @@ public class Dictionary {
 	private Path conf_dir;
 
 	private Properties props;
 
-	private Dictionary(Configuration cfg) {
+	public Dictionary(Configuration cfg) {
 		this.configuration = cfg;
 		this.props = new Properties();
 		this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
@@ -128,6 +123,16 @@ private Dictionary(Configuration cfg) {
 			logger.error("ik-analyzer", e);
 		}
 	}
+
+		try {
+			this.initial();
+		}catch (Exception e){
+			logger.error("init dic error..");
+		}
+	}
+
+	public Configuration getConfiguration() {
+		return configuration;
 	}
 
 	private String getProperty(String key){
@@ -142,31 +147,21 @@ private String getProperty(String key){
 	 *
 	 * @return Dictionary
 	 */
-	public static synchronized void initial(Configuration cfg) {
-		if (singleton == null) {
-			synchronized (Dictionary.class) {
-				if (singleton == null) {
-
-					singleton = new Dictionary(cfg);
-					singleton.loadMainDict();
-					singleton.loadSurnameDict();
-					singleton.loadQuantifierDict();
-					singleton.loadSuffixDict();
-					singleton.loadPrepDict();
-					singleton.loadStopWordDict();
-
-					if(cfg.isEnableRemoteDict()){
-						// 建立监控线程
-						for (String location : singleton.getRemoteExtDictionarys()) {
-							// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
-							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
-						}
-						for (String location : singleton.getRemoteExtStopWordDictionarys()) {
-							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
-						}
-					}
-
-				}
+	private void initial() {
+		loadMainDict();
+		loadSurnameDict();
+		loadQuantifierDict();
+		loadSuffixDict();
+		loadPrepDict();
+		loadStopWordDict();
+		if(this.configuration.isEnableRemoteDict()){
+			// 建立监控线程
+			for (String location : this.getRemoteExtDictionarys()) {
+				// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
+				pool.scheduleAtFixedRate(new Monitor(location + "/" + getIndexName(), this), 10, 60, TimeUnit.SECONDS);
+			}
+			for (String location : this.getRemoteExtStopWordDictionarys()) {
+				pool.scheduleAtFixedRate(new Monitor(location + "/" + getIndexName(), this), 10, 60, TimeUnit.SECONDS);
 			}
 		}
 	}
@@ -287,19 +282,6 @@ private String getDictRoot() {
 	}
 
-	/**
-	 * 获取词典单子实例
-	 *
-	 * @return Dictionary 单例对象
-	 */
-	public static Dictionary getSingleton() {
-		if (singleton == null) {
-			throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
-		}
-		return singleton;
-	}
-
-
 	/**
 	 * 批量加载新词条
 	 *
 	 * @param words Collection<String>词条列表
 	 */
@@ -311,7 +293,7 @@ public void addWords(Collection<String> words) {
 		for (String word : words) {
 			if (word != null) {
 				// 批量加载词条到主内存词典中
-				singleton._MainDict.fillSegment(word.trim().toCharArray());
+				this._MainDict.fillSegment(word.trim().toCharArray());
 			}
 		}
 	}
@@ -325,7 +307,7 @@ public void disableWords(Collection<String> words) {
 		for (String word : words) {
 			if (word != null) {
 				// 批量屏蔽词条
-				singleton._MainDict.disableSegment(word.trim().toCharArray());
+				this._MainDict.disableSegment(word.trim().toCharArray());
 			}
 		}
 	}
@@ -337,7 +319,7 @@ public void disableWords(Collection<String> words) {
 	 * @return Hit 匹配结果描述
 	 */
 	public Hit matchInMainDict(char[] charArray) {
-		return singleton._MainDict.match(charArray);
+		return this._MainDict.match(charArray);
 	}
 
 	/**
@@ -346,7 +328,7 @@ public Hit matchInMainDict(char[] charArray) {
 	 * @return Hit 匹配结果描述
 	 */
 	public Hit matchInMainDict(char[] charArray, int begin, int length) {
-		return singleton._MainDict.match(charArray, begin, length);
+		return this._MainDict.match(charArray, begin, length);
 	}
 
 	/**
@@ -355,7 +337,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) {
 	 * @return Hit 匹配结果描述
 	 */
 	public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
-		return singleton._QuantifierDict.match(charArray, begin, length);
+		return this._QuantifierDict.match(charArray, begin, length);
 	}
 
 	/**
@@ -374,7 +356,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
 	 * @return boolean
 	 */
 	public boolean isStopWord(char[] charArray, int begin, int length) {
-		return singleton._StopWords.match(charArray, begin, length).isMatch();
+		return this._StopWords.match(charArray, begin, length).isMatch();
 	}
 
 	/**
@@ -415,11 +397,11 @@ private void loadExtDict() {
 	private void loadRemoteExtDict() {
 		List<String> remoteExtDictFiles = getRemoteExtDictionarys();
 		for (String location : remoteExtDictFiles) {
-			logger.info("[Dict Loading] " + location);
-			List<String> lists = getRemoteWords(location);
+			logger.info("[Dict Loading] " + location + "/" + getIndexName());
+			List<String> lists = getRemoteWords(location + "/" + getIndexName());
 			// 如果找不到扩展的字典,则忽略
 			if (lists == null) {
-				logger.error("[Dict Loading] " + location + "加载失败");
+				logger.error("[Dict Loading] " + location + "/" + getIndexName() + "加载失败");
 				continue;
 			}
 			for (String theWord : lists) {
@@ -451,6 +433,7 @@ private static List<String> getRemoteWordsUnprivileged(String location) {
 		CloseableHttpClient httpclient = HttpClients.createDefault();
 		CloseableHttpResponse response;
 		BufferedReader in;
+		logger.info("--> location : " + location);
 		HttpGet get = new HttpGet(location);
 		get.setConfig(rc);
 		try {
@@ -514,11 +497,11 @@ private void loadStopWordDict() {
 		// 加载远程停用词典
 		List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
 		for (String location : remoteExtStopWordDictFiles) {
-			logger.info("[Dict Loading] " + location);
-			List<String> lists = getRemoteWords(location);
+			logger.info("[Dict Loading] " + location + "/" + getIndexName());
+			List<String> lists = getRemoteWords(location + "/" + getIndexName());
 			// 如果找不到扩展的字典,则忽略
 			if (lists == null) {
-				logger.error("[Dict Loading] " + location + "加载失败");
+				logger.error("[Dict Loading] " + location + "/" + getIndexName() + "加载失败");
 				continue;
 			}
 			for (String theWord : lists) {
@@ -563,14 +546,12 @@ private void loadPrepDict() {
 	void reLoadMainDict() {
 		logger.info("重新加载词典...");
-		// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
-		Dictionary tmpDict = new Dictionary(configuration);
-		tmpDict.configuration = getSingleton().configuration;
-		tmpDict.loadMainDict();
-		tmpDict.loadStopWordDict();
-		_MainDict = tmpDict._MainDict;
-		_StopWords = tmpDict._StopWords;
+		this.loadMainDict();
+		this.loadStopWordDict();
 		logger.info("重新加载词典完毕...");
 	}
 
+	public String getIndexName() {
+		return configuration.getIndexName();
+	}
 }
diff --git a/src/main/java/org/wltea/analyzer/dic/Monitor.java b/src/main/java/org/wltea/analyzer/dic/Monitor.java
index a5771ef8..c15868ca 100644
--- a/src/main/java/org/wltea/analyzer/dic/Monitor.java
+++ b/src/main/java/org/wltea/analyzer/dic/Monitor.java
@@ -32,10 +32,13 @@ public class Monitor implements Runnable {
 	 */
 	private String location;
 
-	public Monitor(String location) {
+	private Dictionary dictionary;
+
+	public Monitor(String location, Dictionary dictionary) {
 		this.location = location;
 		this.last_modified = null;
 		this.eTags = null;
+		this.dictionary = dictionary;
 	}
 
 	public void run() {
@@ -84,7 +87,7 @@ public void runUnprivileged() {
!response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) { // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags - Dictionary.getSingleton().reLoadMainDict(); + this.dictionary.reLoadMainDict(); last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue(); eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue(); }