From fc50e19e35a24cc3cdfd172b8c3a8904f86decb3 Mon Sep 17 00:00:00 2001 From: bairen Date: Thu, 5 Jan 2023 16:16:18 +0800 Subject: [PATCH] fix "startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards" Error --- .../index/analysis/PinyinTokenizer.java | 19 +++++-- .../index/analysis/PinyinAnalysisTest.java | 53 +++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java index 7540da0..7898682 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java @@ -16,7 +16,6 @@ public class PinyinTokenizer extends Tokenizer { - private static final int DEFAULT_BUFFER_SIZE = 256; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private boolean done = false; @@ -156,6 +155,7 @@ public final boolean incrementToken() throws IOException { StringBuilder buff = new StringBuilder(); int buffStartPosition = 0; int buffSize = 0; + int[] specialCharPosition = new int[source.length()]; position = 0; @@ -183,12 +183,19 @@ public final boolean incrementToken() throws IOException { if (config.keepNoneChineseInJoinedFullPinyin) { fullPinyinLetters.append(c); } + }else{ + //handle special charset + specialCharPosition[i]=1; + ++buffSize; } } else { //clean previous temp if (buff.length() > 0) { - buffSize = parseBuff(buff, buffSize, buffStartPosition); + buffSize = parseBuff(buff, buffSize, buffStartPosition,specialCharPosition); + }else{ + //clean buffSize + buffSize=0; } boolean incrPosition = false; @@ -219,7 +226,7 @@ public final boolean incrementToken() throws IOException { //clean previous temp if (buff.length() > 0) { - buffSize = parseBuff(buff, buffSize, buffStartPosition); + buffSize = parseBuff(buff, buffSize, buffStartPosition,specialCharPosition); } } @@ -270,7 +277,7 @@ public final boolean incrementToken() throws IOException { return false; } - private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { + private int parseBuff(StringBuilder buff, int buffSize, int buffPosition,int[] specialCharPosition) { if (config.keepNoneChinese) { if (config.noneChinesePinyinTokenize) { List result = PinyinAlphabetTokenizer.walk(buff.toString()); @@ -278,6 +285,10 @@ private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { for (int i = 0; i < result.size(); i++) { int end; String t = result.get(i); + //skip special charset + if(specialCharPosition[start]==1){ + ++start; + } if (config.fixedPinyinOffset) { end = start + 1; } else { diff --git a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java index 661a144..ec375e4 100644 --- a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java @@ -1485,4 +1485,57 @@ public void TestPinyinPosition4() throws IOException { } + + @Test + public void TestPinyinPosition5() throws IOException { + String[] s = {"WC-20%权益","刘德华(香港)精选M类"}; + PinyinConfig config = new PinyinConfig(); + config.keepFirstLetter = true; + config.keepOriginal = false; + config.ignorePinyinOffset = false; + + HashMap> result = getStringArrayListHashMap(s, config); + ArrayList re = result.get("WC-20%权益"); + Assert.assertEquals("w", re.get(0).term); + Assert.assertEquals(0, re.get(0).startOffset); + Assert.assertEquals(1, re.get(0).endOffset); + Assert.assertEquals(1, re.get(0).position); + + Assert.assertEquals("wc20qy", re.get(1).term); + Assert.assertEquals(0, re.get(1).startOffset); + Assert.assertEquals(6, re.get(1).endOffset); + Assert.assertEquals(1, re.get(1).position); + + Assert.assertEquals("c", re.get(2).term); + Assert.assertEquals(1, re.get(2).startOffset); + Assert.assertEquals(2, re.get(2).endOffset); + Assert.assertEquals(2, re.get(2).position); + + Assert.assertEquals("20", re.get(3).term); + Assert.assertEquals(3, re.get(3).startOffset); + Assert.assertEquals(5, re.get(3).endOffset); + Assert.assertEquals(3, re.get(3).position); + + Assert.assertEquals("quan", re.get(4).term); + Assert.assertEquals(6, re.get(4).startOffset); + Assert.assertEquals(7, re.get(4).endOffset); + Assert.assertEquals(4, re.get(4).position); + + Assert.assertEquals("yi", re.get(5).term); + Assert.assertEquals(7, re.get(5).startOffset); + Assert.assertEquals(8, re.get(5).endOffset); + Assert.assertEquals(5, re.get(5).position); + + re = result.get("刘德华(香港)精选M类"); + Assert.assertEquals("xuan", re.get(7).term); + Assert.assertEquals(8, re.get(7).startOffset); + Assert.assertEquals(9, re.get(7).endOffset); + Assert.assertEquals(7, re.get(7).position); + + Assert.assertEquals("m", re.get(8).term); + Assert.assertEquals(9, re.get(8).startOffset); + Assert.assertEquals(10, re.get(8).endOffset); + Assert.assertEquals(8, re.get(8).position); + + } }