diff --git a/README.md b/README.md index f1406443..db8ae267 100644 --- a/README.md +++ b/README.md @@ -3,25 +3,25 @@ IK Analysis for Elasticsearch The IK Analysis plugin integrates Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into elasticsearch, support customized dictionary. -Analyzer: `ik_smart` , `ik_max_word` , Tokenizer: `ik_smart` , `ik_max_word` +Analyzer: `ik_smart` , `ik_max_word` , `ik_max_word_char` , Tokenizer: `ik_smart` , `ik_max_word` , `ik_max_word_char` Versions -------- -IK version | ES version ------------|----------- -master | 7.x -> master -6.x| 6.x -5.x| 5.x -1.10.6 | 2.4.6 -1.9.5 | 2.3.5 -1.8.1 | 2.2.1 -1.7.0 | 2.1.1 -1.5.0 | 2.0.0 -1.2.6 | 1.0.0 -1.2.5 | 0.90.x -1.1.3 | 0.20.x -1.0.0 | 0.16.2 -> 0.19.0 +| IK version | ES version | +| ---------- | ---------------- | +| master | 7.x -> master | +| 6.x | 6.x | +| 5.x | 5.x | +| 1.10.6 | 2.4.6 | +| 1.9.5 | 2.3.5 | +| 1.8.1 | 2.2.1 | +| 1.7.0 | 2.1.1 | +| 1.5.0 | 2.0.0 | +| 1.2.6 | 1.0.0 | +| 1.2.5 | 0.90.x | +| 1.1.3 | 0.20.x | +| 1.0.0 | 0.16.2 -> 0.19.0 | Install ------- diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java index 552f9561..4116bd42 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java @@ -9,10 +9,14 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final IKAnalyzer analyzer; - public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) { + public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean useSmart) { + this(indexSettings, env, name, settings, useSmart, false); + } + + public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart, boolean 
includeSingleChar) { super(indexSettings, name, settings); - Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart); + Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart).setIncludeSingleChar(includeSingleChar); analyzer=new IKAnalyzer(configuration); } @@ -25,6 +29,10 @@ public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettin return new IkAnalyzerProvider(indexSettings,env,name,settings,false); } + public static IkAnalyzerProvider getIkIncludeCharAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + return new IkAnalyzerProvider(indexSettings,env,name,settings,false, true); + } + @Override public IKAnalyzer get() { return this.analyzer; } diff --git a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java index d2805618..8c5cc472 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java @@ -19,6 +19,10 @@ public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettin return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false); } + public static IkTokenizerFactory getIkIncludeCharTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false).setIncludeSingleChar(true); + } + public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(true); } @@ -28,6 +32,11 @@ public IkTokenizerFactory setSmart(boolean smart){ return this; } + public IkTokenizerFactory setIncludeSingleChar(boolean singleChar) { + 
this.configuration.setIncludeSingleChar(singleChar); + return this; + } + @Override public Tokenizer create() { return new IKTokenizer(configuration); } diff --git a/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java index e6ed25c7..806057b5 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java @@ -24,6 +24,7 @@ public Map> getTokeniz extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory); extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory); + extra.put("ik_max_word_char", IkTokenizerFactory::getIkIncludeCharTokenizerFactory); return extra; } @@ -34,6 +35,7 @@ public Map tmpHits; + + //是否包含单字分词 + private boolean includeSingleChar=false; - CJKSegmenter(){ + CJKSegmenter(boolean singleChar){ + includeSingleChar=singleChar; + this.tmpHits = new LinkedList(); } @@ -78,21 +83,20 @@ public void analyze(AnalyzeContext context) { //********************************* //再对当前指针位置的字符进行单字匹配 Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); - if(singleCharHit.isMatch()){//首字成词 + if(singleCharHit.isMatch()) {//首字成词 //输出当前的词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); + Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD); + context.addLexeme(newLexeme); + } else if (includeSingleChar) {//单字拆词 + Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNCHAR); context.addLexeme(newLexeme); + } - //同时也是词前缀 - if(singleCharHit.isPrefix()){ - //前缀匹配则放入hit列表 - this.tmpHits.add(singleCharHit); - } - }else if(singleCharHit.isPrefix()){//首字为词前缀 + // 判断词前缀 + if(singleCharHit.isPrefix()){ //前缀匹配则放入hit列表 this.tmpHits.add(singleCharHit); } - }else{ 
//遇到CHAR_USELESS字符 diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java index 789a3a67..46488719 100644 --- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java @@ -81,7 +81,7 @@ private List loadSegmenters(){ //处理中文数量词的子分词器 segmenters.add(new CN_QuantifierSegmenter()); //处理中文词的子分词器 - segmenters.add(new CJKSegmenter()); + segmenters.add(new CJKSegmenter(configuration.isIncludeSingleChar())); return segmenters; }