[improvement](chat) Merge HanlpDictMapper and FuzzyNameMapper into KeywordMapper. (#493)

Co-authored-by: lexluo <lexluo@tencent.com>
This commit is contained in:
lexluo09
2023-12-11 17:22:57 +08:00
committed by GitHub
parent d79f73eab6
commit 0c6efada43
10 changed files with 142 additions and 197 deletions

View File

@@ -6,7 +6,7 @@ import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.config.OptimizationConfig;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.knowledge.dictionary.FuzzyResult;
import com.tencent.supersonic.knowledge.dictionary.DatabaseMapResult;
import com.tencent.supersonic.knowledge.service.SchemaService;
import java.util.ArrayList;
import java.util.HashSet;
@@ -26,7 +26,7 @@ import org.springframework.util.CollectionUtils;
*/
@Service
@Slf4j
public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> {
public class DatabaseMatchStrategy extends BaseMatchStrategy<DatabaseMapResult> {
@Autowired
private OptimizationConfig optimizationConfig;
@@ -37,25 +37,25 @@ public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> {
private List<SchemaElement> allElements;
@Override
public Map<MatchText, List<FuzzyResult>> match(QueryContext queryContext, List<Term> terms,
public Map<MatchText, List<DatabaseMapResult>> match(QueryContext queryContext, List<Term> terms,
Set<Long> detectModelIds) {
this.allElements = getSchemaElements();
return super.match(queryContext, terms, detectModelIds);
}
@Override
public boolean needDelete(FuzzyResult oneRoundResult, FuzzyResult existResult) {
public boolean needDelete(DatabaseMapResult oneRoundResult, DatabaseMapResult existResult) {
return getMapKey(oneRoundResult).equals(getMapKey(existResult))
&& existResult.getDetectWord().length() < oneRoundResult.getDetectWord().length();
}
@Override
public String getMapKey(FuzzyResult a) {
public String getMapKey(DatabaseMapResult a) {
return a.getName() + Constants.UNDERLINE + a.getSchemaElement().getId()
+ Constants.UNDERLINE + a.getSchemaElement().getName();
}
public void detectByStep(QueryContext queryContext, Set<FuzzyResult> existResults, Set<Long> detectModelIds,
public void detectByStep(QueryContext queryContext, Set<DatabaseMapResult> existResults, Set<Long> detectModelIds,
Integer startIndex, Integer index, int offset) {
String detectSegment = queryContext.getRequest().getQueryText().substring(startIndex, index);
if (StringUtils.isBlank(detectSegment)) {
@@ -80,11 +80,11 @@ public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> {
.collect(Collectors.toSet());
}
for (SchemaElement schemaElement : schemaElements) {
FuzzyResult fuzzyResult = new FuzzyResult();
fuzzyResult.setDetectWord(detectSegment);
fuzzyResult.setName(schemaElement.getName());
fuzzyResult.setSchemaElement(schemaElement);
existResults.add(fuzzyResult);
DatabaseMapResult databaseMapResult = new DatabaseMapResult();
databaseMapResult.setDetectWord(detectSegment);
databaseMapResult.setName(schemaElement.getName());
databaseMapResult.setSchemaElement(schemaElement);
existResults.add(databaseMapResult);
}
}
}

View File

@@ -15,7 +15,7 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
/***
* A mapper that is capable of semantic understanding of text.
* A mapper that recognize elements through embedding.
*/
@Slf4j
public class EmbeddingMapper extends BaseMapper {
@@ -23,7 +23,6 @@ public class EmbeddingMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
//1. query from embedding by queryText
String queryText = queryContext.getRequest().getQueryText();
List<Term> terms = HanlpHelper.getTerms(queryText);
@@ -39,11 +38,11 @@ public class EmbeddingMapper extends BaseMapper {
SchemaElement schemaElement = JSONObject.parseObject(JSONObject.toJSONString(matchResult.getMetadata()),
SchemaElement.class);
if (StringUtils.isBlank(matchResult.getMetadata().get("modelId"))) {
String modelIdStr = matchResult.getMetadata().get("modelId");
if (StringUtils.isBlank(modelIdStr)) {
continue;
}
long modelId = Long.parseLong(matchResult.getMetadata().get("modelId"));
long modelId = Long.parseLong(modelIdStr);
schemaElement = getSchemaElement(modelId, schemaElement.getType(), elementId);
if (schemaElement == null) {
continue;

View File

@@ -1,67 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.FuzzyResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
/***
* A mapper capable of fuzzy parsing of metric names and dimension names.
*/
@Slf4j
public class FuzzyNameMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
List<Term> terms = HanlpHelper.getTerms(queryContext.getRequest().getQueryText());
FuzzyNameMatchStrategy fuzzyNameMatchStrategy = ContextUtils.getBean(FuzzyNameMatchStrategy.class);
MapperHelper mapperHelper = ContextUtils.getBean(MapperHelper.class);
List<FuzzyResult> matches = fuzzyNameMatchStrategy.getMatches(queryContext, terms);
for (FuzzyResult match : matches) {
SchemaElement schemaElement = match.getSchemaElement();
Set<Long> regElementSet = getRegElementSet(queryContext.getMapInfo(), schemaElement);
if (regElementSet.contains(schemaElement.getId())) {
continue;
}
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(schemaElement)
.word(schemaElement.getName())
.detectWord(match.getDetectWord())
.frequency(10000L)
.similarity(mapperHelper.getSimilarity(match.getDetectWord(), schemaElement.getName()))
.build();
log.info("add to schema, elementMatch {}", schemaElementMatch);
addToSchemaMap(queryContext.getMapInfo(), schemaElement.getModel(), schemaElementMatch);
}
}
private Set<Long> getRegElementSet(SchemaMapInfo schemaMap, SchemaElement schemaElement) {
List<SchemaElementMatch> elements = schemaMap.getMatchedElements(schemaElement.getModel());
if (CollectionUtils.isEmpty(elements)) {
return new HashSet<>();
}
return elements.stream()
.filter(elementMatch ->
SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
|| SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType()))
.map(elementMatch -> elementMatch.getElement().getId())
.collect(Collectors.toSet());
}
}

View File

@@ -1,84 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.HanlpMapResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import com.tencent.supersonic.knowledge.utils.NatureHelper;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
/***
* A mapper capable of prefix and suffix similarity parsing for
* domain names, dimension values, metric names, and dimension names.
*/
@Slf4j
public class HanlpDictMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
String queryText = queryContext.getRequest().getQueryText();
List<Term> terms = HanlpHelper.getTerms(queryText);
HanlpDictMatchStrategy matchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);
List<HanlpMapResult> matches = matchStrategy.getMatches(queryContext, terms);
HanlpHelper.transLetterOriginal(matches);
convertTermsToSchemaMapInfo(matches, queryContext.getMapInfo(), terms);
}
private void convertTermsToSchemaMapInfo(List<HanlpMapResult> hanlpMapResults, SchemaMapInfo schemaMap,
List<Term> terms) {
if (CollectionUtils.isEmpty(hanlpMapResults)) {
return;
}
Map<String, Long> wordNatureToFrequency = terms.stream().collect(
Collectors.toMap(entry -> entry.getWord() + entry.getNature(),
term -> Long.valueOf(term.getFrequency()), (value1, value2) -> value2));
for (HanlpMapResult hanlpMapResult : hanlpMapResults) {
for (String nature : hanlpMapResult.getNatures()) {
Long modelId = NatureHelper.getModelId(nature);
if (Objects.isNull(modelId)) {
continue;
}
SchemaElementType elementType = NatureHelper.convertToElementType(nature);
if (Objects.isNull(elementType)) {
continue;
}
Long elementID = NatureHelper.getElementID(nature);
SchemaElement element = getSchemaElement(modelId, elementType, elementID);
if (element == null) {
continue;
}
if (element.getType().equals(SchemaElementType.VALUE)) {
element.setName(hanlpMapResult.getName());
}
Long frequency = wordNatureToFrequency.get(hanlpMapResult.getName() + nature);
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(element)
.frequency(frequency)
.word(hanlpMapResult.getName())
.similarity(hanlpMapResult.getSimilarity())
.detectWord(hanlpMapResult.getDetectWord())
.build();
addToSchemaMap(schemaMap, modelId, schemaElementMatch);
}
}
}
}

View File

@@ -0,0 +1,122 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.DatabaseMapResult;
import com.tencent.supersonic.knowledge.dictionary.HanlpMapResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import com.tencent.supersonic.knowledge.utils.NatureHelper;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
/***
* A mapper that recognize elements through keyword.
* It includes two recognition strategies: HanlpDictMatchStrategy and DatabaseMatchStrategy.
*
*/
@Slf4j
public class KeywordMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
String queryText = queryContext.getRequest().getQueryText();
//1.hanlpDict Match
List<Term> terms = HanlpHelper.getTerms(queryText);
HanlpDictMatchStrategy hanlpMatchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);
List<HanlpMapResult> hanlpMapResults = hanlpMatchStrategy.getMatches(queryContext, terms);
convertHanlpMapResultToMapInfo(hanlpMapResults, queryContext.getMapInfo(), terms);
//2.database Match
DatabaseMatchStrategy databaseMatchStrategy = ContextUtils.getBean(DatabaseMatchStrategy.class);
List<DatabaseMapResult> databaseResults = databaseMatchStrategy.getMatches(queryContext, terms);
convertDatabaseMapResultToMapInfo(queryContext, databaseResults);
}
private void convertHanlpMapResultToMapInfo(List<HanlpMapResult> mapResults, SchemaMapInfo schemaMap,
List<Term> terms) {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}
HanlpHelper.transLetterOriginal(mapResults);
Map<String, Long> wordNatureToFrequency = terms.stream().collect(
Collectors.toMap(entry -> entry.getWord() + entry.getNature(),
term -> Long.valueOf(term.getFrequency()), (value1, value2) -> value2));
for (HanlpMapResult hanlpMapResult : mapResults) {
for (String nature : hanlpMapResult.getNatures()) {
Long modelId = NatureHelper.getModelId(nature);
if (Objects.isNull(modelId)) {
continue;
}
SchemaElementType elementType = NatureHelper.convertToElementType(nature);
if (Objects.isNull(elementType)) {
continue;
}
Long elementID = NatureHelper.getElementID(nature);
SchemaElement element = getSchemaElement(modelId, elementType, elementID);
if (element == null) {
continue;
}
if (element.getType().equals(SchemaElementType.VALUE)) {
element.setName(hanlpMapResult.getName());
}
Long frequency = wordNatureToFrequency.get(hanlpMapResult.getName() + nature);
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(element)
.frequency(frequency)
.word(hanlpMapResult.getName())
.similarity(hanlpMapResult.getSimilarity())
.detectWord(hanlpMapResult.getDetectWord())
.build();
addToSchemaMap(schemaMap, modelId, schemaElementMatch);
}
}
}
private void convertDatabaseMapResultToMapInfo(QueryContext queryContext, List<DatabaseMapResult> mapResults) {
MapperHelper mapperHelper = ContextUtils.getBean(MapperHelper.class);
for (DatabaseMapResult match : mapResults) {
SchemaElement schemaElement = match.getSchemaElement();
Set<Long> regElementSet = getRegElementSet(queryContext.getMapInfo(), schemaElement);
if (regElementSet.contains(schemaElement.getId())) {
continue;
}
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(schemaElement)
.word(schemaElement.getName())
.detectWord(match.getDetectWord())
.frequency(10000L)
.similarity(mapperHelper.getSimilarity(match.getDetectWord(), schemaElement.getName()))
.build();
log.info("add to schema, elementMatch {}", schemaElementMatch);
addToSchemaMap(queryContext.getMapInfo(), schemaElement.getModel(), schemaElementMatch);
}
}
private Set<Long> getRegElementSet(SchemaMapInfo schemaMap, SchemaElement schemaElement) {
List<SchemaElementMatch> elements = schemaMap.getMatchedElements(schemaElement.getModel());
if (CollectionUtils.isEmpty(elements)) {
return new HashSet<>();
}
return elements.stream()
.filter(elementMatch ->
SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
|| SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType()))
.map(elementMatch -> elementMatch.getElement().getId())
.collect(Collectors.toSet());
}
}

View File

@@ -1,22 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.request.QueryReq;
import com.tencent.supersonic.chat.test.context.ContextTest;
import org.junit.jupiter.api.Test;
/**
* HanlpDictMapperTest
*/
class HanlpDictMapperTest extends ContextTest {
@Test
void map() {
QueryReq queryRequest = new QueryReq();
queryRequest.setChatId(1);
queryRequest.setModelId(2L);
queryRequest.setQueryText("supersonic按部门访问次数");
HanlpDictMapper hanlpDictMapper = new HanlpDictMapper();
hanlpDictMapper.map(new QueryContext(queryRequest));
}
}

View File

@@ -7,7 +7,7 @@ import lombok.ToString;
@Data
@ToString
public class FuzzyResult extends MapResult {
public class DatabaseMapResult extends MapResult {
private SchemaElement schemaElement;
@@ -19,7 +19,7 @@ public class FuzzyResult extends MapResult {
if (o == null || getClass() != o.getClass()) {
return false;
}
FuzzyResult that = (FuzzyResult) o;
DatabaseMapResult that = (DatabaseMapResult) o;
return Objects.equal(name, that.name) && Objects.equal(schemaElement, that.schemaElement);
}