diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index dadd0f20..344b9541 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -23,6 +23,8 @@ public class Configuration { //是否启用远程词典加载 private boolean enableRemoteDict=false; + //自定义词库名 + private String remoteDictName=""; //是否启用小写处理 private boolean enableLowercase=true; @@ -36,6 +38,7 @@ public Configuration(Environment env,Settings settings) { this.useSmart = settings.get("use_smart", "false").equals("true"); this.enableLowercase = settings.get("enable_lowercase", "true").equals("true"); this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true"); + this.remoteDictName = settings.get("custom_dict_name", ""); Dictionary.initial(this); @@ -68,6 +71,9 @@ public Settings getSettings() { public boolean isEnableRemoteDict() { return enableRemoteDict; } + public String getRemoteDictName() { + return remoteDictName; + } public boolean isEnableLowercase() { return enableLowercase; diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java index 890d9080..59c88cc4 100644 --- a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java +++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java @@ -322,7 +322,7 @@ Lexeme getNextLexeme(){ while(result != null){ //数量词合并 this.compound(result); - if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ + if(Dictionary.getDictionary(cfg.getRemoteDictName()).isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ //是停止词继续取列表的下一个 result = this.results.pollFirst(); }else{ diff --git a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java index a31a5d41..fe5495dd 100644 --- 
a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java @@ -25,6 +25,7 @@ */ package org.wltea.analyzer.core; +import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.dic.Hit; @@ -39,12 +40,14 @@ class CJKSegmenter implements ISegmenter { //子分词器标签 static final String SEGMENTER_NAME = "CJK_SEGMENTER"; + private final Configuration cfg; //待处理的分词hit队列 private List tmpHits; - CJKSegmenter(){ + CJKSegmenter(Configuration cfg){ this.tmpHits = new LinkedList(); + this.cfg = cfg; } /* (non-Javadoc) @@ -58,7 +61,7 @@ public void analyze(AnalyzeContext context) { //处理词段队列 Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); if(hit.isMatch()){ //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); @@ -77,7 +80,7 @@ public void analyze(AnalyzeContext context) { //********************************* //再对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); + Hit singleCharHit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); if(singleCharHit.isMatch()){//首字成词 //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); diff --git a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java index e147c2f0..db45a125 100644 --- a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java +++ 
b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Set; +import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.dic.Hit; @@ -50,7 +51,9 @@ class CN_QuantifierSegmenter implements ISegmenter{ ChnNumberChars.add(nChar); } } - + + private final Configuration cfg; + /* * 词元的开始位置, * 同时作为子分词器状态标识 @@ -67,10 +70,11 @@ class CN_QuantifierSegmenter implements ISegmenter{ private List countHits; - CN_QuantifierSegmenter(){ + CN_QuantifierSegmenter(Configuration cfg){ nStart = -1; nEnd = -1; this.countHits = new LinkedList(); + this.cfg = cfg; } /** @@ -153,7 +157,7 @@ private void processCount(AnalyzeContext context){ //处理词段队列 Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); if(hit.isMatch()){ //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); @@ -172,7 +176,7 @@ private void processCount(AnalyzeContext context){ //********************************* //对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); + Hit singleCharHit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); if(singleCharHit.isMatch()){//首字成量词词 //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java index 789a3a67..969fd368 100644 --- 
a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java @@ -79,9 +79,9 @@ private List loadSegmenters(){ //处理字母的子分词器 segmenters.add(new LetterSegmenter()); //处理中文数量词的子分词器 - segmenters.add(new CN_QuantifierSegmenter()); + segmenters.add(new CN_QuantifierSegmenter(this.configuration)); //处理中文词的子分词器 - segmenters.add(new CJKSegmenter()); + segmenters.add(new CJKSegmenter(this.configuration)); return segmenters; } diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 39212002..28322cb4 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -67,7 +67,7 @@ public class Dictionary { /* * 词典单子实例 */ - private static Dictionary singleton; + private static Map dictionaryMap = new HashMap(); private DictSegment _MainDict; @@ -143,26 +143,25 @@ private String getProperty(String key){ * @return Dictionary */ public static synchronized void initial(Configuration cfg) { - if (singleton == null) { + if (!dictionaryMap.containsKey(cfg.getRemoteDictName())) { synchronized (Dictionary.class) { - if (singleton == null) { - - singleton = new Dictionary(cfg); - singleton.loadMainDict(); - singleton.loadSurnameDict(); - singleton.loadQuantifierDict(); - singleton.loadSuffixDict(); - singleton.loadPrepDict(); - singleton.loadStopWordDict(); - + if (!dictionaryMap.containsKey(cfg.getRemoteDictName())) { + Dictionary newDict = new Dictionary(cfg); + newDict.loadMainDict(); + newDict.loadSurnameDict(); + newDict.loadQuantifierDict(); + newDict.loadSuffixDict(); + newDict.loadPrepDict(); + newDict.loadStopWordDict(); + dictionaryMap.put(cfg.getRemoteDictName(), newDict); if(cfg.isEnableRemoteDict()){ // 建立监控线程 - for (String location : singleton.getRemoteExtDictionarys()) { + for (String location : dictionaryMap.get(cfg.getRemoteDictName()).getRemoteExtDictionarys()) { // 10 秒是初始延迟可以修改的 
60是间隔时间 单位秒 - pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); + pool.scheduleAtFixedRate(new Monitor(location, cfg.getRemoteDictName()), 10, 60, TimeUnit.SECONDS); } - for (String location : singleton.getRemoteExtStopWordDictionarys()) { - pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); + for (String location : dictionaryMap.get(cfg.getRemoteDictName()).getRemoteExtStopWordDictionarys()) { + pool.scheduleAtFixedRate(new Monitor(location, cfg.getRemoteDictName()), 10, 60, TimeUnit.SECONDS); } } @@ -241,6 +240,9 @@ private List getRemoteExtDictionarys() { String[] filePaths = remoteExtDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { + if (!"".equals(this.configuration.getRemoteDictName())) { + filePath += "/" + this.configuration.getRemoteDictName(); + } remoteExtDictFiles.add(filePath); } @@ -274,6 +276,9 @@ private List getRemoteExtStopWordDictionarys() { String[] filePaths = remoteExtStopWordDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { + if (!"".equals(this.configuration.getRemoteDictName())) { + filePath += "/" + this.configuration.getRemoteDictName(); + } remoteExtStopWordDictFiles.add(filePath); } @@ -287,16 +292,27 @@ private String getDictRoot() { } +// /** +// * 获取词典单子实例 +// * +// * @return Dictionary 单例对象 +// */ +// public static Dictionary getSingleton() { +// if (singleton == null) { +// throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); +// } +// return singleton; +// } /** - * 获取词典单子实例 - * - * @return Dictionary 单例对象 + * Returns the dictionary instance registered under dictName; throws IllegalStateException if none was initialized + * + * @return the Dictionary registered under dictName */ - public static Dictionary getSingleton() { - if (singleton == null) { + public static Dictionary getDictionary(String dictName) { + if (!dictionaryMap.containsKey(dictName)) { throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first."); } - return singleton; + return 
dictionaryMap.get(dictName); } @@ -311,7 +327,7 @@ public void addWords(Collection words) { for (String word : words) { if (word != null) { // 批量加载词条到主内存词典中 - singleton._MainDict.fillSegment(word.trim().toCharArray()); + this._MainDict.fillSegment(word.trim().toCharArray()); } } } @@ -325,7 +341,7 @@ public void disableWords(Collection words) { for (String word : words) { if (word != null) { // 批量屏蔽词条 - singleton._MainDict.disableSegment(word.trim().toCharArray()); + this._MainDict.disableSegment(word.trim().toCharArray()); } } } @@ -337,7 +353,7 @@ public void disableWords(Collection words) { * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray) { - return singleton._MainDict.match(charArray); + return this._MainDict.match(charArray); } /** @@ -346,7 +362,7 @@ public Hit matchInMainDict(char[] charArray) { * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray, int begin, int length) { - return singleton._MainDict.match(charArray, begin, length); + return this._MainDict.match(charArray, begin, length); } /** @@ -355,7 +371,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) { * @return Hit 匹配结果描述 */ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { - return singleton._QuantifierDict.match(charArray, begin, length); + return this._QuantifierDict.match(charArray, begin, length); } /** @@ -374,7 +390,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { * @return boolean */ public boolean isStopWord(char[] charArray, int begin, int length) { - return singleton._StopWords.match(charArray, begin, length).isMatch(); + return this._StopWords.match(charArray, begin, length).isMatch(); } /** @@ -565,7 +581,7 @@ void reLoadMainDict() { logger.info("start to reload ik dict."); // 新开一个实例加载词典,减少加载过程对当前词典使用的影响 Dictionary tmpDict = new Dictionary(configuration); - tmpDict.configuration = getSingleton().configuration; + tmpDict.configuration = this.configuration; 
tmpDict.loadMainDict(); tmpDict.loadStopWordDict(); _MainDict = tmpDict._MainDict; diff --git a/src/main/java/org/wltea/analyzer/dic/Monitor.java b/src/main/java/org/wltea/analyzer/dic/Monitor.java index a5771ef8..3c7c29d9 100644 --- a/src/main/java/org/wltea/analyzer/dic/Monitor.java +++ b/src/main/java/org/wltea/analyzer/dic/Monitor.java @@ -18,6 +18,10 @@ public class Monitor implements Runnable { private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName()); private static CloseableHttpClient httpclient = HttpClients.createDefault(); + /* + * Custom dictionary name used to look up the Dictionary to reload; empty string when custom_dict_name is not configured + */ + private final String customRemoteDictName; /* * 上次更改时间 */ @@ -32,10 +36,11 @@ public class Monitor implements Runnable { */ private String location; - public Monitor(String location) { + public Monitor(String location, String customRemoteDictName) { this.location = location; this.last_modified = null; this.eTags = null; + this.customRemoteDictName = customRemoteDictName; } public void run() { @@ -84,7 +89,7 @@ public void runUnprivileged() { ||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) { // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags - Dictionary.getSingleton().reLoadMainDict(); + Dictionary.getDictionary(customRemoteDictName).reLoadMainDict(); last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue(); eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue(); }