From 83cf171ec55b8ec143e94185733645aafaf6f622 Mon Sep 17 00:00:00 2001 From: lexluo09 <39718951+lexluo09@users.noreply.github.com> Date: Sat, 12 Oct 2024 20:26:24 +0800 Subject: [PATCH] [improvement][chat] Fix the display of aliases or synonyms for terms (#1793) --- .../headless/chat/knowledge/DictWord.java | 6 + ...Type.java => DataSetWithSemanticType.java} | 8 +- .../service/impl/RetrieveServiceImpl.java | 236 +++++++++--------- .../config/SwaggerConfiguration.java | 7 +- 4 files changed, 135 insertions(+), 122 deletions(-) rename headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/{ModelWithSemanticType.java => DataSetWithSemanticType.java} (59%) diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictWord.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictWord.java index 127907e5b..97bef6f69 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictWord.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/knowledge/DictWord.java @@ -1,6 +1,9 @@ package com.tencent.supersonic.headless.chat.knowledge; +import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import lombok.ToString; import java.util.Objects; @@ -8,6 +11,9 @@ import java.util.Objects; /** * word nature */ @Data @ToString +@Builder +@AllArgsConstructor +@NoArgsConstructor public class DictWord { private String word; diff --git a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/ModelWithSemanticType.java b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/DataSetWithSemanticType.java similarity index 59% rename from headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/ModelWithSemanticType.java rename to headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/DataSetWithSemanticType.java index 96b8856b0..710b6916f 100644 --- a/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/ModelWithSemanticType.java +++ b/headless/chat/src/main/java/com/tencent/supersonic/headless/chat/mapper/DataSetWithSemanticType.java @@ -8,13 +8,13 @@ import java.io.Serializable; @Data @ToString -public class ModelWithSemanticType implements Serializable { +public class DataSetWithSemanticType implements Serializable { - private Long model; + private Long dataSetId; private SchemaElementType schemaElementType; - public ModelWithSemanticType(Long model, SchemaElementType schemaElementType) { - this.model = model; + public DataSetWithSemanticType(Long dataSetId, SchemaElementType schemaElementType) { + this.dataSetId = dataSetId; this.schemaElementType = schemaElementType; } } diff --git a/headless/server/src/main/java/com/tencent/supersonic/headless/server/service/impl/RetrieveServiceImpl.java b/headless/server/src/main/java/com/tencent/supersonic/headless/server/service/impl/RetrieveServiceImpl.java index 29aef754c..b42dce3b1 100644 --- a/headless/server/src/main/java/com/tencent/supersonic/headless/server/service/impl/RetrieveServiceImpl.java +++ b/headless/server/src/main/java/com/tencent/supersonic/headless/server/service/impl/RetrieveServiceImpl.java @@ -1,12 +1,10 @@ package com.tencent.supersonic.headless.server.service.impl; -import com.google.common.collect.Lists; import com.tencent.supersonic.auth.api.authentication.pojo.User; import com.tencent.supersonic.common.pojo.enums.DictWordType; import com.tencent.supersonic.headless.api.pojo.SchemaElement; import com.tencent.supersonic.headless.api.pojo.SchemaElementType; import com.tencent.supersonic.headless.api.pojo.SemanticSchema; -import com.tencent.supersonic.headless.api.pojo.request.QueryFilter; import com.tencent.supersonic.headless.api.pojo.request.QueryFilters; import com.tencent.supersonic.headless.api.pojo.request.QueryNLReq; import com.tencent.supersonic.headless.api.pojo.response.S2Term; @@ -18,8 +16,8 @@ import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult; import com.tencent.supersonic.headless.chat.knowledge.KnowledgeBaseService; import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper; import com.tencent.supersonic.headless.chat.knowledge.helper.NatureHelper; +import com.tencent.supersonic.headless.chat.mapper.DataSetWithSemanticType; import com.tencent.supersonic.headless.chat.mapper.MatchText; -import com.tencent.supersonic.headless.chat.mapper.ModelWithSemanticType; import com.tencent.supersonic.headless.chat.mapper.SearchMatchStrategy; import com.tencent.supersonic.headless.server.service.DataSetService; import com.tencent.supersonic.headless.server.service.RetrieveService; @@ -31,6 +29,7 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.LinkedHashMap; @@ -62,18 +61,18 @@ public class RetrieveServiceImpl implements RetrieveService { @Override public List retrieve(QueryNLReq queryNLReq) { - String queryText = queryNLReq.getQueryText(); - // 1.get meta info + + // 1. Get meta info SemanticSchema semanticSchemaDb = schemaService.getSemanticSchema(queryNLReq.getDataSetIds()); - List metricsDb = semanticSchemaDb.getMetrics(); - final Map dataSetIdToName = semanticSchemaDb.getDataSetIdToName(); + Map dataSetIdToName = semanticSchemaDb.getDataSetIdToName(); Map> modelIdToDataSetIds = dataSetService.getModelIdToDataSetIds( new ArrayList<>(dataSetIdToName.keySet()), User.getDefaultUser()); - // 2.detect by segment + + // 2. Detect by segment List originals = knowledgeBaseService.getTerms(queryText, modelIdToDataSetIds); - log.debug("hanlp parse result: {}", originals); + log.debug("originals terms: {}", originals); Set dataSetIds = queryNLReq.getDataSetIds(); ChatQueryContext chatQueryContext = new ChatQueryContext(); @@ -82,47 +81,42 @@ public class RetrieveServiceImpl implements RetrieveService { Map> regTextMap = searchMatchStrategy.match(chatQueryContext, originals, dataSetIds); + regTextMap.values().forEach(HanlpHelper::transLetterOriginal); - regTextMap.entrySet().stream().forEach(m -> HanlpHelper.transLetterOriginal(m.getValue())); + // 3. Get the most matching data + Optional>> mostSimilarSearchResult = regTextMap + .entrySet().stream().filter(entry -> CollectionUtils.isNotEmpty(entry.getValue())) + .max(Comparator.comparingInt(entry -> entry.getKey().getDetectSegment().length())); - // 3.get the most matching data - Optional>> mostSimilarSearchResult = - regTextMap.entrySet().stream() - .filter(entry -> CollectionUtils.isNotEmpty(entry.getValue())) - .reduce((entry1, entry2) -> entry1.getKey().getDetectSegment() - .length() >= entry2.getKey().getDetectSegment().length() ? entry1 - : entry2); - - // 4.optimize the results after the query if (!mostSimilarSearchResult.isPresent()) { - return Lists.newArrayList(); + return Collections.emptyList(); } + Map.Entry> searchTextEntry = mostSimilarSearchResult.get(); log.debug("searchTextEntry:{},queryNLReq:{}", searchTextEntry, queryNLReq); - Set searchResults = new LinkedHashSet(); - DataSetInfoStat dataSetInfoStat = NatureHelper.getDataSetStat(originals); + DataSetInfoStat dataSetInfoStat = NatureHelper.getDataSetStat(originals); List possibleDataSets = getPossibleDataSets(queryNLReq, originals, dataSetIds); - // 5.1 priority dimension metric - boolean existMetricAndDimension = searchMetricAndDimension(new HashSet<>(possibleDataSets), - dataSetIdToName, searchTextEntry, searchResults); + // 5.1 Priority dimension metric + Set searchResults = searchMetricAndDimension(new HashSet<>(possibleDataSets), + dataSetIdToName, searchTextEntry); + boolean existMetricAndDimension = CollectionUtils.isNotEmpty(searchResults); - // 5.2 process based on dimension values + // 5.2 Process based on dimension values MatchText matchText = searchTextEntry.getKey(); Map natureToNameMap = getNatureToNameMap(searchTextEntry, new HashSet<>(possibleDataSets)); log.debug("possibleDataSets:{},natureToNameMap:{}", possibleDataSets, natureToNameMap); for (Map.Entry natureToNameEntry : natureToNameMap.entrySet()) { - - Set searchResultSet = searchDimensionValue(metricsDb, dataSetIdToName, + Set results = searchDimensionValue(semanticSchemaDb, dataSetInfoStat.getMetricDataSetCount(), existMetricAndDimension, matchText, natureToNameMap, natureToNameEntry, queryNLReq.getQueryFilters()); - - searchResults.addAll(searchResultSet); + searchResults.addAll(results); } + return searchResults.stream().limit(RESULT_SIZE).collect(Collectors.toList()); } @@ -142,83 +136,89 @@ public class RetrieveServiceImpl implements RetrieveService { return possibleDataSets; } - private Set searchDimensionValue(List metricsDb, - Map modelToName, long metricModelCount, boolean existMetricAndDimension, - MatchText matchText, Map natureToNameMap, - Map.Entry natureToNameEntry, QueryFilters queryFilters) { + private Set searchDimensionValue(SemanticSchema semanticSchemaDb, + long metricModelCount, boolean existMetricAndDimension, MatchText matchText, + Map natureToNameMap, Map.Entry natureToNameEntry, + QueryFilters queryFilters) { + List metricsDb = semanticSchemaDb.getMetrics(); + Map dataSetIdToName = semanticSchemaDb.getDataSetIdToName(); - Set searchResults = new LinkedHashSet(); + Set searchResults = new LinkedHashSet<>(); String nature = natureToNameEntry.getKey(); String wordName = natureToNameEntry.getValue(); - Long modelId = NatureHelper.getDataSetId(nature); + Long dataSetId = NatureHelper.getDataSetId(nature); SchemaElementType schemaElementType = NatureHelper.convertToElementType(nature); + // Skip if the schema element type is ENTITY if (SchemaElementType.ENTITY.equals(schemaElementType)) { return searchResults; } - // If there are no metric/dimension, complete the metric information - SearchResult searchResult = SearchResult.builder().modelId(modelId) - .modelName(modelToName.get(modelId)).recommend(matchText.getRegText() + wordName) - .schemaElementType(schemaElementType).subRecommend(wordName).build(); - if (metricModelCount <= 0 && !existMetricAndDimension) { + // Create a base search result + SearchResult baseSearchResult = createBaseSearchResult(dataSetId, dataSetIdToName, + matchText, wordName, schemaElementType); + + // If there are no metrics or dimensions, complete the metric information + if (shouldCompleteMetricInfo(metricModelCount, existMetricAndDimension)) { if (filterByQueryFilter(wordName, queryFilters)) { return searchResults; } - searchResults.add(searchResult); - int metricSize = getMetricSize(natureToNameMap); - List metrics = filerMetricsByModel(metricsDb, modelId, metricSize * 3).stream() - .limit(metricSize).collect(Collectors.toList()); + searchResults.add(baseSearchResult); + + int metricSize = calculateMetricSize(natureToNameMap); + List metrics = getFilteredMetrics(metricsDb, dataSetId, metricSize); for (String metric : metrics) { - SearchResult result = SearchResult.builder().modelId(modelId) - .modelName(modelToName.get(modelId)) - .recommend(matchText.getRegText() + wordName + DictWordType.SPACE + metric) - .subRecommend(wordName + DictWordType.SPACE + metric).isComplete(false) - .build(); - searchResults.add(result); + SearchResult metricSearchResult = createMetricSearchResult(dataSetId, + dataSetIdToName, matchText, wordName, metric); + searchResults.add(metricSearchResult); } } else { - searchResults.add(searchResult); + searchResults.add(baseSearchResult); } + return searchResults; } - private int getMetricSize(Map natureToNameMap) { - int metricSize = RESULT_SIZE / (natureToNameMap.entrySet().size()); - if (metricSize <= 1) { - metricSize = 1; - } - return metricSize; + private SearchResult createBaseSearchResult(Long dataSetId, Map dataSetIdToName, + MatchText matchText, String wordName, SchemaElementType schemaElementType) { + return SearchResult.builder().modelId(dataSetId).modelName(dataSetIdToName.get(dataSetId)) + .recommend(matchText.getRegText() + wordName).schemaElementType(schemaElementType) + .subRecommend(wordName).build(); + } + + private boolean shouldCompleteMetricInfo(long metricModelCount, + boolean existMetricAndDimension) { + return metricModelCount <= 0 && !existMetricAndDimension; + } + + private int calculateMetricSize(Map natureToNameMap) { + int metricSize = RESULT_SIZE / natureToNameMap.size(); + return Math.max(metricSize, 1); + } + + private List getFilteredMetrics(List metricsDb, Long modelId, + int metricSize) { + return metricsDb.stream() + .filter(mapDO -> Objects.nonNull(mapDO) && modelId.equals(mapDO.getDataSetId())) + .sorted(Comparator.comparing(SchemaElement::getUseCnt).reversed()) + .map(SchemaElement::getName).limit(metricSize).collect(Collectors.toList()); + } + + private SearchResult createMetricSearchResult(Long modelId, Map modelToName, + MatchText matchText, String wordName, String metric) { + return SearchResult.builder().modelId(modelId).modelName(modelToName.get(modelId)) + .recommend(matchText.getRegText() + wordName + DictWordType.SPACE + metric) + .subRecommend(wordName + DictWordType.SPACE + metric).isComplete(false).build(); } private boolean filterByQueryFilter(String wordName, QueryFilters queryFilters) { if (queryFilters == null || CollectionUtils.isEmpty(queryFilters.getFilters())) { return false; } - List filters = queryFilters.getFilters(); - for (QueryFilter filter : filters) { - if (wordName.equalsIgnoreCase(String.valueOf(filter.getValue()))) { - return false; - } - } - return true; - } - - protected List filerMetricsByModel(List metricsDb, Long model, - int metricSize) { - if (CollectionUtils.isEmpty(metricsDb)) { - return Lists.newArrayList(); - } - return metricsDb.stream() - .filter(mapDO -> Objects.nonNull(mapDO) && model.equals(mapDO.getDataSetId())) - .sorted(Comparator.comparing(SchemaElement::getUseCnt).reversed()) - .flatMap(entry -> { - List result = new ArrayList<>(); - result.add(entry.getName()); - return result.stream(); - }).limit(metricSize).collect(Collectors.toList()); + return queryFilters.getFilters().stream() + .noneMatch(filter -> wordName.equalsIgnoreCase(String.valueOf(filter.getValue()))); } /** @@ -230,65 +230,71 @@ public class RetrieveServiceImpl implements RetrieveService { private Map getNatureToNameMap( Map.Entry> recommendTextListEntry, Set possibleModels) { + List recommendValues = recommendTextListEntry.getValue(); - return recommendValues.stream() - .flatMap(entry -> entry.getNatures().stream().filter(nature -> { - if (CollectionUtils.isEmpty(possibleModels)) { - return true; - } - Long model = NatureHelper.getDataSetId(nature); - return possibleModels.contains(model); - }).map(nature -> { - DictWord posDO = new DictWord(); - posDO.setWord(entry.getName()); - posDO.setNature(nature); - return posDO; - })).sorted(Comparator.comparingInt(a -> a.getWord().length())) + + return recommendValues.stream().flatMap(entry -> { + List filteredNatures = entry.getNatures().stream() + .filter(nature -> isNatureValid(nature, possibleModels)) + .collect(Collectors.toList()); + + return filteredNatures.stream() + .map(nature -> DictWord.builder().word(entry.getName()).nature(nature).build()); + }).sorted(Comparator.comparingInt(dictWord -> dictWord.getWord().length())) .collect(Collectors.toMap(DictWord::getNature, DictWord::getWord, (value1, value2) -> value1, LinkedHashMap::new)); } - private boolean searchMetricAndDimension(Set possibleDataSets, - Map modelToName, - Map.Entry> searchTextEntry, - Set searchResults) { - boolean existMetric = false; + private boolean isNatureValid(String nature, Set possibleModels) { + if (CollectionUtils.isEmpty(possibleModels)) { + return true; + } + Long model = NatureHelper.getDataSetId(nature); + return possibleModels.contains(model); + } + + private Set searchMetricAndDimension(Set possibleDataSets, + Map dataSetIdToName, + Map.Entry> searchTextEntry) { + + Set searchResults = new LinkedHashSet<>(); log.debug("searchMetricAndDimension searchTextEntry:{}", searchTextEntry); + MatchText matchText = searchTextEntry.getKey(); List hanlpMapResults = searchTextEntry.getValue(); for (HanlpMapResult hanlpMapResult : hanlpMapResults) { - - List dimensionMetricClassIds = hanlpMapResult.getNatures() + List dimensionMetricDataSetIds = hanlpMapResult.getNatures() .stream() - .map(nature -> new ModelWithSemanticType(NatureHelper.getDataSetId(nature), + .map(nature -> new DataSetWithSemanticType(NatureHelper.getDataSetId(nature), NatureHelper.convertToElementType(nature))) .filter(entry -> matchCondition(entry, possibleDataSets)) .collect(Collectors.toList()); - if (CollectionUtils.isEmpty(dimensionMetricClassIds)) { + if (CollectionUtils.isEmpty(dimensionMetricDataSetIds)) { continue; } - for (ModelWithSemanticType modelWithSemanticType : dimensionMetricClassIds) { - existMetric = true; - Long modelId = modelWithSemanticType.getModel(); - SchemaElementType schemaElementType = modelWithSemanticType.getSchemaElementType(); + for (DataSetWithSemanticType dataSetWithSemanticType : dimensionMetricDataSetIds) { + Long dataSetId = dataSetWithSemanticType.getDataSetId(); + SchemaElementType schemaElementType = + dataSetWithSemanticType.getSchemaElementType(); + String modelName = dataSetIdToName.get(dataSetId); + String recommendText = matchText.getRegText() + hanlpMapResult.getName(); + String subRecommendText = hanlpMapResult.getName(); + SearchResult searchResult = - SearchResult.builder().modelId(modelId).modelName(modelToName.get(modelId)) - .recommend(matchText.getRegText() + hanlpMapResult.getName()) - .subRecommend(hanlpMapResult.getName()) + SearchResult.builder().modelId(dataSetId).modelName(modelName) + .recommend(recommendText).subRecommend(subRecommendText) .schemaElementType(schemaElementType).build(); - // visibility to filter metrics + searchResults.add(searchResult); } - log.debug("parseResult:{},dimensionMetricClassIds:{},possibleDataSets:{}", - hanlpMapResult, dimensionMetricClassIds, possibleDataSets); } log.info("searchMetricAndDimension searchResults:{}", searchResults); - return existMetric; + return searchResults; } - private boolean matchCondition(ModelWithSemanticType entry, Set possibleDataSets) { + private boolean matchCondition(DataSetWithSemanticType entry, Set possibleDataSets) { if (!(SchemaElementType.METRIC.equals(entry.getSchemaElementType()) || SchemaElementType.DIMENSION.equals(entry.getSchemaElementType()))) { return false; @@ -297,6 +303,6 @@ public class RetrieveServiceImpl implements RetrieveService { if (CollectionUtils.isEmpty(possibleDataSets)) { return true; } - return possibleDataSets.contains(entry.getModel()); + return possibleDataSets.contains(entry.getDataSetId()); } } diff --git a/launchers/standalone/src/main/java/com/tencent/supersonic/config/SwaggerConfiguration.java b/launchers/standalone/src/main/java/com/tencent/supersonic/config/SwaggerConfiguration.java index 97a3e6a57..dfe751959 100644 --- a/launchers/standalone/src/main/java/com/tencent/supersonic/config/SwaggerConfiguration.java +++ b/launchers/standalone/src/main/java/com/tencent/supersonic/config/SwaggerConfiguration.java @@ -5,6 +5,10 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; + +import java.util.Collections; +import java.util.List; + import springfox.documentation.builders.ApiInfoBuilder; import springfox.documentation.builders.PathSelectors; import springfox.documentation.builders.RequestHandlerSelectors; @@ -18,9 +22,6 @@ import springfox.documentation.spi.service.contexts.SecurityContext; import springfox.documentation.spring.web.plugins.Docket; import springfox.documentation.swagger2.annotations.EnableSwagger2; -import java.util.Collections; -import java.util.List; - @Configuration @EnableSwagger2 @EnableOpenApi