From 3b65b1c80b94e4adc64696db7d4f4fcccf0f1cc2 Mon Sep 17 00:00:00 2001 From: lexluo09 <39718951+lexluo09@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:06:10 +0800 Subject: [PATCH] [improvement][headless] Remove entities from the dictionary and search interface (#1878) --- .../common/pojo/enums/DictWordType.java | 2 - ...rdBuilder.java => DataSetWordBuilder.java} | 3 +- .../knowledge/builder/EntityWordBuilder.java | 37 ------------------- .../knowledge/builder/WordBuilderFactory.java | 3 +- .../chat/knowledge/helper/NatureHelper.java | 13 ++++--- .../chat/mapper/SearchMatchStrategy.java | 13 ------- 6 files changed, 9 insertions(+), 62 deletions(-) rename headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/{ModelWordBuilder.java => DataSetWordBuilder.java} (93%) delete mode 100644 headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/EntityWordBuilder.java diff --git a/common/src/main/java/com/tencent/supersonic/common/pojo/enums/DictWordType.java b/common/src/main/java/com/tencent/supersonic/common/pojo/enums/DictWordType.java index 5259131d4..6dbb99013 100644 --- a/common/src/main/java/com/tencent/supersonic/common/pojo/enums/DictWordType.java +++ b/common/src/main/java/com/tencent/supersonic/common/pojo/enums/DictWordType.java @@ -14,8 +14,6 @@ public enum DictWordType { DATASET("dataSet"), - ENTITY("entity"), - NUMBER("m"), TAG("tag"), diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/ModelWordBuilder.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/DataSetWordBuilder.java similarity index 93% rename from headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/ModelWordBuilder.java rename to headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/DataSetWordBuilder.java index 474de5473..43f3fa82a 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/ModelWordBuilder.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/DataSetWordBuilder.java @@ -10,10 +10,9 @@ import org.springframework.stereotype.Service; import java.util.List; import java.util.Objects; -/** model word nature */ @Service @Slf4j -public class ModelWordBuilder extends BaseWordWithAliasBuilder { +public class DataSetWordBuilder extends BaseWordWithAliasBuilder { @Override public List doGet(String word, SchemaElement schemaElement) { diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/EntityWordBuilder.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/EntityWordBuilder.java deleted file mode 100644 index d4eb900ca..000000000 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/EntityWordBuilder.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.tencent.supersonic.headless.chat.knowledge.builder; - -import com.google.common.collect.Lists; -import com.tencent.supersonic.common.pojo.enums.DictWordType; -import com.tencent.supersonic.headless.api.pojo.SchemaElement; -import com.tencent.supersonic.headless.chat.knowledge.DictWord; -import lombok.extern.slf4j.Slf4j; -import org.springframework.stereotype.Service; - -import java.util.List; -import java.util.Objects; - -@Service -@Slf4j -public class EntityWordBuilder extends BaseWordWithAliasBuilder { - - @Override - public List doGet(String word, SchemaElement schemaElement) { - List result = Lists.newArrayList(); - if (Objects.isNull(schemaElement)) { - return result; - } - result.add(getOneWordNature(word, schemaElement, false)); - result.addAll(getOneWordNatureAlias(schemaElement, false)); - return result; - } - - @Override - public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) { - String nature = DictWordType.NATURE_SPILT + schemaElement.getModel() - + DictWordType.NATURE_SPILT + schemaElement.getId() + DictWordType.ENTITY.getType(); - DictWord dictWord = new DictWord(); - dictWord.setWord(word); - dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY * 2, nature)); - return dictWord; - } -} diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/WordBuilderFactory.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/WordBuilderFactory.java index ef60a2314..a917b9ba5 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/WordBuilderFactory.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/builder/WordBuilderFactory.java @@ -13,8 +13,7 @@ public class WordBuilderFactory { static { wordNatures.put(DictWordType.DIMENSION, new DimensionWordBuilder()); wordNatures.put(DictWordType.METRIC, new MetricWordBuilder()); - wordNatures.put(DictWordType.DATASET, new ModelWordBuilder()); - wordNatures.put(DictWordType.ENTITY, new EntityWordBuilder()); + wordNatures.put(DictWordType.DATASET, new DataSetWordBuilder()); wordNatures.put(DictWordType.VALUE, new ValueWordBuilder()); wordNatures.put(DictWordType.TERM, new TermWordBuilder()); } diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/NatureHelper.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/NatureHelper.java index 013f01851..3271520e3 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/NatureHelper.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/helper/NatureHelper.java @@ -18,7 +18,9 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; -/** nature parse helper */ +/** + * nature parse helper + */ @Slf4j public class NatureHelper { @@ -50,10 +52,9 @@ public class NatureHelper { return result; } - private static boolean isDataSetOrEntity(S2Term term, Integer model) { + private static boolean isDataSet(S2Term term, Integer model) { String natureStr = term.nature.toString(); - return (DictWordType.NATURE_SPILT + model).equals(natureStr) - || natureStr.endsWith(DictWordType.ENTITY.getType()); + return (DictWordType.NATURE_SPILT + model).equals(natureStr); } public static Integer getDataSetByNature(Nature nature) { @@ -119,8 +120,8 @@ public class NatureHelper { } private static long getDataSetCount(List terms) { - return terms.stream() - .filter(term -> isDataSetOrEntity(term, getDataSetByNature(term.nature))).count(); + return terms.stream().filter(term -> isDataSet(term, getDataSetByNature(term.nature))) + .count(); } private static long getDimensionValueCount(List terms) { diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/SearchMatchStrategy.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/SearchMatchStrategy.java index b076305dc..65d343f13 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/SearchMatchStrategy.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/SearchMatchStrategy.java @@ -1,13 +1,11 @@ package com.tencent.supersonic.headless.chat.mapper; import com.google.common.collect.Lists; -import com.tencent.supersonic.common.pojo.enums.DictWordType; import com.tencent.supersonic.headless.api.pojo.response.S2Term; import com.tencent.supersonic.headless.chat.ChatQueryContext; import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult; import com.tencent.supersonic.headless.chat.knowledge.KnowledgeBaseService; import com.tencent.supersonic.headless.chat.knowledge.SearchService; -import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; @@ -17,7 +15,6 @@ import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.stream.Collectors; /** * SearchMatchStrategy encapsulates a concrete matching algorithm executed during search process. @@ -66,16 +63,6 @@ public class SearchMatchStrategy extends BaseMatchStrategy { knowledgeBaseService.suffixSearch(detectSegment, SEARCH_SIZE, chatQueryContext.getModelIdToDataSetIds(), detectDataSetIds); hanlpMapResults.addAll(suffixHanlpMapResults); - // remove entity name where search - hanlpMapResults = hanlpMapResults.stream().filter(entry -> { - List natures = entry.getNatures().stream() - .filter(nature -> !nature.endsWith(DictWordType.ENTITY.getType())) - .collect(Collectors.toList()); - if (CollectionUtils.isEmpty(natures)) { - return false; - } - return true; - }).collect(Collectors.toList()); MatchText matchText = MatchText.builder().regText(regText).detectSegment(detectSegment).build(); regTextMap.put(matchText, hanlpMapResults);