Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

支持多个不同的远程词库 #777

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/main/java/org/wltea/analyzer/cfg/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public class Configuration {

//是否启用远程词典加载
private boolean enableRemoteDict=false;
//自定义词库名
private String remoteDictName="";

//是否启用小写处理
private boolean enableLowercase=true;
Expand All @@ -36,6 +38,7 @@ public Configuration(Environment env,Settings settings) {
this.useSmart = settings.get("use_smart", "false").equals("true");
this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
this.remoteDictName = settings.get("custom_dict_name", "");

Dictionary.initial(this);

Expand Down Expand Up @@ -68,6 +71,9 @@ public Settings getSettings() {
/**
 * Reports whether remote dictionary loading is enabled.
 *
 * @return the flag read from the {@code enable_remote_dict} setting in the constructor
 */
public boolean isEnableRemoteDict() {
    return this.enableRemoteDict;
}
/**
 * Returns the name of the custom remote dictionary for this configuration.
 *
 * @return the value of the {@code custom_dict_name} setting, or the empty string
 *         when that setting is absent
 */
public String getRemoteDictName() {
    return this.remoteDictName;
}

public boolean isEnableLowercase() {
return enableLowercase;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ Lexeme getNextLexeme(){
while(result != null){
//数量词合并
this.compound(result);
if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
if(Dictionary.getDictionary(cfg.getRemoteDictName()).isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
//是停止词继续取列表的下一个
result = this.results.pollFirst();
}else{
Expand Down
9 changes: 6 additions & 3 deletions src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
*/
package org.wltea.analyzer.core;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

Expand All @@ -39,12 +40,14 @@ class CJKSegmenter implements ISegmenter {

//子分词器标签
static final String SEGMENTER_NAME = "CJK_SEGMENTER";
private final Configuration cfg;
//待处理的分词hit队列
private List<Hit> tmpHits;


CJKSegmenter(){
CJKSegmenter(Configuration cfg){
this.tmpHits = new LinkedList<Hit>();
this.cfg = cfg;
}

/* (non-Javadoc)
Expand All @@ -58,7 +61,7 @@ public void analyze(AnalyzeContext context) {
//处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
Expand All @@ -77,7 +80,7 @@ public void analyze(AnalyzeContext context) {

//*********************************
//再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.List;
import java.util.Set;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

Expand All @@ -50,7 +51,9 @@ class CN_QuantifierSegmenter implements ISegmenter{
ChnNumberChars.add(nChar);
}
}


private final Configuration cfg;

/*
* 词元的开始位置,
* 同时作为子分词器状态标识
Expand All @@ -67,10 +70,11 @@ class CN_QuantifierSegmenter implements ISegmenter{
private List<Hit> countHits;


CN_QuantifierSegmenter(){
CN_QuantifierSegmenter(Configuration cfg){
nStart = -1;
nEnd = -1;
this.countHits = new LinkedList<Hit>();
this.cfg = cfg;
}

/**
Expand Down Expand Up @@ -153,7 +157,7 @@ private void processCount(AnalyzeContext context){
//处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
Expand All @@ -172,7 +176,7 @@ private void processCount(AnalyzeContext context){

//*********************************
//对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getDictionary(cfg.getRemoteDictName()).matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/wltea/analyzer/core/IKSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ private List<ISegmenter> loadSegmenters(){
//处理字母的子分词器
segmenters.add(new LetterSegmenter());
//处理中文数量词的子分词器
segmenters.add(new CN_QuantifierSegmenter());
segmenters.add(new CN_QuantifierSegmenter(this.configuration));
//处理中文词的子分词器
segmenters.add(new CJKSegmenter());
segmenters.add(new CJKSegmenter(this.configuration));
return segmenters;
}

Expand Down
74 changes: 45 additions & 29 deletions src/main/java/org/wltea/analyzer/dic/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public class Dictionary {
/*
* 词典单子实例
*/
private static Dictionary singleton;
private static Map<String, Dictionary> dictionaryMap = new HashMap<String, Dictionary>();

private DictSegment _MainDict;

Expand Down Expand Up @@ -143,26 +143,25 @@ private String getProperty(String key){
* @return Dictionary
*/
public static synchronized void initial(Configuration cfg) {
if (singleton == null) {
if (!dictionaryMap.containsKey(cfg.getRemoteDictName())) {
synchronized (Dictionary.class) {
if (singleton == null) {

singleton = new Dictionary(cfg);
singleton.loadMainDict();
singleton.loadSurnameDict();
singleton.loadQuantifierDict();
singleton.loadSuffixDict();
singleton.loadPrepDict();
singleton.loadStopWordDict();

if (!dictionaryMap.containsKey(cfg.getRemoteDictName())) {
Dictionary newDict = new Dictionary(cfg);
newDict.loadMainDict();
newDict.loadSurnameDict();
newDict.loadQuantifierDict();
newDict.loadSuffixDict();
newDict.loadPrepDict();
newDict.loadStopWordDict();
dictionaryMap.put(cfg.getRemoteDictName(), newDict);
if(cfg.isEnableRemoteDict()){
// 建立监控线程
for (String location : singleton.getRemoteExtDictionarys()) {
for (String location : dictionaryMap.get(cfg.getRemoteDictName()).getRemoteExtDictionarys()) {
// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
pool.scheduleAtFixedRate(new Monitor(location, cfg.getRemoteDictName()), 10, 60, TimeUnit.SECONDS);
}
for (String location : singleton.getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
for (String location : dictionaryMap.get(cfg.getRemoteDictName()).getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location, cfg.getRemoteDictName()), 10, 60, TimeUnit.SECONDS);
}
}

Expand Down Expand Up @@ -241,6 +240,9 @@ private List<String> getRemoteExtDictionarys() {
String[] filePaths = remoteExtDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
if (this.configuration.getRemoteDictName()!="") {
filePath += "/" + this.configuration.getRemoteDictName();
}
remoteExtDictFiles.add(filePath);

}
Expand Down Expand Up @@ -274,6 +276,9 @@ private List<String> getRemoteExtStopWordDictionarys() {
String[] filePaths = remoteExtStopWordDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
if (this.configuration.getRemoteDictName()!="") {
filePath += "/" + this.configuration.getRemoteDictName();
}
remoteExtStopWordDictFiles.add(filePath);

}
Expand All @@ -287,16 +292,27 @@ private String getDictRoot() {
}


// /**
// * 获取词典单子实例
// *
// * @return Dictionary 单例对象
// */
// public static Dictionary getSingleton() {
// if (singleton == null) {
// throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
// }
// return singleton;
// }
/**
* 获取词典单子实例
*
* @return Dictionary 单例对象
* 获取词典实例
*
* @return Dictionary 对象
*/
public static Dictionary getSingleton() {
if (singleton == null) {
public static Dictionary getDictionary(String dictName) {
if (!dictionaryMap.containsKey(dictName)) {
throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
}
return singleton;
return dictionaryMap.get(dictName);
}


Expand All @@ -311,7 +327,7 @@ public void addWords(Collection<String> words) {
for (String word : words) {
if (word != null) {
// 批量加载词条到主内存词典中
singleton._MainDict.fillSegment(word.trim().toCharArray());
this._MainDict.fillSegment(word.trim().toCharArray());
}
}
}
Expand All @@ -325,7 +341,7 @@ public void disableWords(Collection<String> words) {
for (String word : words) {
if (word != null) {
// 批量屏蔽词条
singleton._MainDict.disableSegment(word.trim().toCharArray());
this._MainDict.disableSegment(word.trim().toCharArray());
}
}
}
Expand All @@ -337,7 +353,7 @@ public void disableWords(Collection<String> words) {
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray) {
return singleton._MainDict.match(charArray);
return this._MainDict.match(charArray);
}

/**
Expand All @@ -346,7 +362,7 @@ public Hit matchInMainDict(char[] charArray) {
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray, int begin, int length) {
return singleton._MainDict.match(charArray, begin, length);
return this._MainDict.match(charArray, begin, length);
}

/**
Expand All @@ -355,7 +371,7 @@ public Hit matchInMainDict(char[] charArray, int begin, int length) {
* @return Hit 匹配结果描述
*/
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
return singleton._QuantifierDict.match(charArray, begin, length);
return this._QuantifierDict.match(charArray, begin, length);
}

/**
Expand All @@ -374,7 +390,7 @@ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
* @return boolean
*/
public boolean isStopWord(char[] charArray, int begin, int length) {
return singleton._StopWords.match(charArray, begin, length).isMatch();
return this._StopWords.match(charArray, begin, length).isMatch();
}

/**
Expand Down Expand Up @@ -565,7 +581,7 @@ void reLoadMainDict() {
logger.info("start to reload ik dict.");
// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
Dictionary tmpDict = new Dictionary(configuration);
tmpDict.configuration = getSingleton().configuration;
tmpDict.configuration = this.configuration;
tmpDict.loadMainDict();
tmpDict.loadStopWordDict();
_MainDict = tmpDict._MainDict;
Expand Down
9 changes: 7 additions & 2 deletions src/main/java/org/wltea/analyzer/dic/Monitor.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ public class Monitor implements Runnable {
private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());

private static CloseableHttpClient httpclient = HttpClients.createDefault();
/*
* Name of the custom remote dictionary this monitor watches; used as the key passed to
* Dictionary.getDictionary(...) when the remote file has changed and must be reloaded.
* NOTE(review): the original comment stated the default is "common", but the
* Configuration change in this same diff defaults custom_dict_name to "" — confirm.
*/
private final String customRemoteDictName;
/*
* 上次更改时间
*/
Expand All @@ -32,10 +36,11 @@ public class Monitor implements Runnable {
*/
private String location;

public Monitor(String location) {
public Monitor(String location, String customRemoteDictName) {
this.location = location;
this.last_modified = null;
this.eTags = null;
this.customRemoteDictName = customRemoteDictName;
}

public void run() {
Expand Down Expand Up @@ -84,7 +89,7 @@ public void runUnprivileged() {
||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) {

// 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
Dictionary.getSingleton().reLoadMainDict();
Dictionary.getDictionary(customRemoteDictName).reLoadMainDict();
last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue();
eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue();
}
Expand Down