From 4e4943ffd15379c9284a9a8425266dd684f3b901 Mon Sep 17 00:00:00 2001 From: jipeli <54889677+jipeli@users.noreply.github.com> Date: Thu, 27 Jun 2024 18:56:34 +0800 Subject: [PATCH] (improvement)(chat) fixed HanLP multiple uppercase(#963) (#1254) --- .../hanlp/dictionary/CoreDictionary.java | 45 +++++++++++- .../knowledge/DictionaryAttributeUtil.java | 21 ++++-- .../chat/knowledge/MultiCustomDictionary.java | 8 ++- .../chat/knowledge/helper/HanlpHelper.java | 71 ++++++++++++++++--- .../tencent/supersonic/util/HanlpTest.java | 39 ++++++++++ 5 files changed, 166 insertions(+), 18 deletions(-) create mode 100644 launchers/standalone/src/test/java/com/tencent/supersonic/util/HanlpTest.java diff --git a/common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java b/common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java index dfef774d6..085e5d5ec 100644 --- a/common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java +++ b/common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java @@ -8,7 +8,6 @@ import com.hankcs.hanlp.corpus.io.IOUtil; import com.hankcs.hanlp.corpus.tag.Nature; import com.hankcs.hanlp.utility.Predefine; import com.hankcs.hanlp.utility.TextUtility; - import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.DataOutputStream; @@ -16,8 +15,11 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; +import java.util.Arrays; import java.util.Collection; +import java.util.List; import java.util.TreeMap; +import java.util.stream.Collectors; /** * 使用DoubleArrayTrie实现的核心词典 @@ -73,7 +75,8 @@ public class CoreDictionary { totalFrequency += attribute.totalFrequency; } Predefine.logger.info( - "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start) + "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + ( + System.currentTimeMillis() - start) + "ms"); br.close(); trie.build(map); @@ -214,12 +217,14 @@ public class CoreDictionary { public int[] frequency; public int totalFrequency; + public String[] originals; public String original = null; public Attribute(int size) { nature = new Nature[size]; frequency = new int[size]; + originals = new String[size]; } public Attribute(Nature[] nature, int[] frequency) { @@ -240,6 +245,13 @@ public class CoreDictionary { this.totalFrequency = totalFrequency; } + public Attribute(Nature[] nature, int[] frequency, String[] originals, int totalFrequency) { + this.nature = nature; + this.frequency = frequency; + this.originals = originals; + this.totalFrequency = totalFrequency; + } + /** * 使用单个词性,默认词频1000构造 * @@ -365,6 +377,35 @@ public class CoreDictionary { out.writeInt(frequency[i]); } } + + public void setOriginals(String original) { + if (original == null) { + return; + } + if (originals == null || originals.length == 0) { + originals = new String[1]; + } + originals[0] = original; + } + + public String getOriginal(Nature find) { + if (originals == null || originals.length == 0 || find == null) { + return null; + } + for (int i = 0; i < nature.length; i++) { + if (find.equals(nature[i]) && originals.length > i) { + return originals[i]; + } + } + return null; + } + + public List getOriginals() { + if (originals == null || originals.length == 0) { + return null; + } + return Arrays.stream(originals).filter(o -> o != null).distinct().collect(Collectors.toList()); + } } /** diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictionaryAttributeUtil.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictionaryAttributeUtil.java index f12aa8d0c..8ce079d15 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictionaryAttributeUtil.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictionaryAttributeUtil.java @@ -8,6 +8,7 @@ import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -18,21 +19,31 @@ public class DictionaryAttributeUtil { public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) { Map map = new HashMap<>(); - IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i])); - IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i])); + Map originalMap = new HashMap<>(); + IntStream.range(0, old.nature.length).boxed().forEach(i -> { + map.put(old.nature[i], old.frequency[i]); + if (Objects.nonNull(old.originals)) { + originalMap.put(old.nature[i], old.originals[i]); + } + }); + IntStream.range(0, add.nature.length).boxed().forEach(i -> { + map.put(add.nature[i], add.frequency[i]); + if (Objects.nonNull(add.originals)) { + originalMap.put(add.nature[i], add.originals[i]); + } + }); List> list = new LinkedList>(map.entrySet()); Collections.sort(list, new Comparator>() { public int compare(Map.Entry o1, Map.Entry o2) { return o2.getValue() - o1.getValue(); } }); + String[] originals = list.stream().map(l -> originalMap.get(l.getKey())).toArray(String[]::new); CoreDictionary.Attribute attribute = new CoreDictionary.Attribute( list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]), list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(), + originals, list.stream().map(i -> i.getValue()).findFirst().get()); - if (old.original != null || add.original != null) { - attribute.original = add.original != null ? add.original : old.original; - } return attribute; } } diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/MultiCustomDictionary.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/MultiCustomDictionary.java index 004fbca3b..6b9fd50d7 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/MultiCustomDictionary.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/MultiCustomDictionary.java @@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary { attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i], customNatureCollector); attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]); + attribute.originals[i] = original; attribute.totalFrequency += attribute.frequency[i]; } } - attribute.original = original; + //attribute.original = original; if (removeDuplicates && map.containsKey(word)) { attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute); @@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary { if (att == null) { return false; } else if (this.dat.containsKey(word)) { - att.original = original; + att.setOriginals(original); att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att); this.dat.set(word, att); // return true; @@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary { if (this.trie == null) { this.trie = new BinTrie(); } - att.original = original; + //att.original = original; + att.setOriginals(original); if (this.trie.containsKey(word)) { att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att); } diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/HanlpHelper.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/HanlpHelper.java index 20f2e39cf..32ca78768 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/HanlpHelper.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/HanlpHelper.java @@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment; import com.hankcs.hanlp.seg.common.Term; import com.tencent.supersonic.common.pojo.enums.DictWordType; import com.tencent.supersonic.headless.api.pojo.response.S2Term; +import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult; import com.tencent.supersonic.headless.chat.knowledge.DictWord; +import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult; import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter; +import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult; import com.tencent.supersonic.headless.chat.knowledge.MapResult; import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary; import com.tencent.supersonic.headless.chat.knowledge.SearchService; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; -import org.springframework.beans.BeanUtils; -import org.springframework.util.CollectionUtils; -import org.springframework.util.ResourceUtils; - import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; @@ -29,6 +26,11 @@ import java.util.Collection; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.springframework.beans.BeanUtils; +import org.springframework.util.CollectionUtils; +import org.springframework.util.ResourceUtils; /** * HanLP helper @@ -200,16 +202,69 @@ public class HanlpHelper { if (CollectionUtils.isEmpty(mapResults)) { return; } + List newResults = new ArrayList<>(); for (T mapResult : mapResults) { + boolean isAdd = false; if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) { if (CustomDictionary.contains(mapResult.getName())) { CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName()); - if (attribute != null && attribute.original != null) { - mapResult.setName(attribute.original); + if (attribute != null) { + isAdd = addLetterOriginal(newResults, mapResult, attribute); + } + } + } + if (!isAdd) { + newResults.add(mapResult); + } + } + mapResults.clear(); + mapResults.addAll(newResults); + } + + public static boolean addLetterOriginal(List mapResults, T mapResult, + CoreDictionary.Attribute attribute) { + boolean isAdd = false; + if (attribute != null) { + if (mapResult instanceof HanlpMapResult) { + HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult; + for (String nature : hanlpMapResult.getNatures()) { + String orig = attribute.getOriginal(Nature.fromString(nature)); + if (orig != null) { + MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature), + hanlpMapResult.getDetectWord()); + mapResults.add((T) addMapResult); + isAdd = true; + } + } + } else if (mapResult instanceof DatabaseMapResult) { + List originals = attribute.getOriginals(); + if (!CollectionUtils.isEmpty(originals)) { + for (String orig : originals) { + DatabaseMapResult addMapResult = new DatabaseMapResult(); + addMapResult.setName(orig); + addMapResult.setSchemaElement(((DatabaseMapResult) mapResult).getSchemaElement()); + addMapResult.setDetectWord(mapResult.getDetectWord()); + mapResults.add((T) addMapResult); + isAdd = true; + } + } + } else if (mapResult instanceof EmbeddingResult) { + List originals = attribute.getOriginals(); + if (!CollectionUtils.isEmpty(originals)) { + for (String orig : originals) { + EmbeddingResult addMapResult = new EmbeddingResult(); + addMapResult.setName(orig); + addMapResult.setDetectWord(mapResult.getDetectWord()); + addMapResult.setId(((EmbeddingResult) mapResult).getId()); + addMapResult.setMetadata(((EmbeddingResult) mapResult).getMetadata()); + addMapResult.setDistance(((EmbeddingResult) mapResult).getDistance()); + mapResults.add((T) addMapResult); + isAdd = true; } } } } + return isAdd; } public static List getTerms(String text, Map> modelIdToDataSetIds) { diff --git a/launchers/standalone/src/test/java/com/tencent/supersonic/util/HanlpTest.java b/launchers/standalone/src/test/java/com/tencent/supersonic/util/HanlpTest.java new file mode 100644 index 000000000..276a60099 --- /dev/null +++ b/launchers/standalone/src/test/java/com/tencent/supersonic/util/HanlpTest.java @@ -0,0 +1,39 @@ +package com.tencent.supersonic.util; + +import com.hankcs.hanlp.corpus.tag.Nature; +import com.hankcs.hanlp.dictionary.CoreDictionary.Attribute; +import com.tencent.supersonic.headless.api.pojo.SchemaElement; +import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult; +import com.tencent.supersonic.headless.chat.knowledge.MapResult; +import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper; +import java.util.ArrayList; +import java.util.List; +import org.junit.Assert; +import org.junit.Test; + +public class HanlpTest { + + @Test + public void test() { + Nature[] nature = new Nature[2]; + nature[0] = Nature.create("_3"); + nature[1] = Nature.create("_4"); + int[] frequency = new int[2]; + frequency[0] = 100; + frequency[1] = 200; + String[] originals = new String[2]; + originals[0] = "AA"; + originals[1] = "Aa"; + Attribute att = new Attribute(nature, frequency, originals, 200); + att.original = "DDDDD"; + HanlpHelper.getDynamicCustomDictionary().getTrie().set("aa", att); + List mapResults = new ArrayList<>(); + DatabaseMapResult addMapResult = new DatabaseMapResult(); + addMapResult.setName("aa"); + addMapResult.setSchemaElement(new SchemaElement()); + addMapResult.setDetectWord("abc"); + mapResults.add(addMapResult); + HanlpHelper.transLetterOriginal(mapResults); + Assert.assertEquals(mapResults.size(), 2); + } +} \ No newline at end of file