[improvement](chat) Merge HanlpDictMapper and FuzzyNameMapper into KeywordMapper. (#493)

Co-authored-by: lexluo <lexluo@tencent.com>
This commit is contained in:
lexluo09
2023-12-11 17:22:57 +08:00
committed by GitHub
parent d79f73eab6
commit 0c6efada43
10 changed files with 142 additions and 197 deletions

View File

@@ -6,7 +6,7 @@ import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch; import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.config.OptimizationConfig; import com.tencent.supersonic.chat.config.OptimizationConfig;
import com.tencent.supersonic.common.pojo.Constants; import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.knowledge.dictionary.FuzzyResult; import com.tencent.supersonic.knowledge.dictionary.DatabaseMapResult;
import com.tencent.supersonic.knowledge.service.SchemaService; import com.tencent.supersonic.knowledge.service.SchemaService;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
@@ -26,7 +26,7 @@ import org.springframework.util.CollectionUtils;
*/ */
@Service @Service
@Slf4j @Slf4j
public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> { public class DatabaseMatchStrategy extends BaseMatchStrategy<DatabaseMapResult> {
@Autowired @Autowired
private OptimizationConfig optimizationConfig; private OptimizationConfig optimizationConfig;
@@ -37,25 +37,25 @@ public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> {
private List<SchemaElement> allElements; private List<SchemaElement> allElements;
@Override @Override
public Map<MatchText, List<FuzzyResult>> match(QueryContext queryContext, List<Term> terms, public Map<MatchText, List<DatabaseMapResult>> match(QueryContext queryContext, List<Term> terms,
Set<Long> detectModelIds) { Set<Long> detectModelIds) {
this.allElements = getSchemaElements(); this.allElements = getSchemaElements();
return super.match(queryContext, terms, detectModelIds); return super.match(queryContext, terms, detectModelIds);
} }
@Override @Override
public boolean needDelete(FuzzyResult oneRoundResult, FuzzyResult existResult) { public boolean needDelete(DatabaseMapResult oneRoundResult, DatabaseMapResult existResult) {
return getMapKey(oneRoundResult).equals(getMapKey(existResult)) return getMapKey(oneRoundResult).equals(getMapKey(existResult))
&& existResult.getDetectWord().length() < oneRoundResult.getDetectWord().length(); && existResult.getDetectWord().length() < oneRoundResult.getDetectWord().length();
} }
@Override @Override
public String getMapKey(FuzzyResult a) { public String getMapKey(DatabaseMapResult a) {
return a.getName() + Constants.UNDERLINE + a.getSchemaElement().getId() return a.getName() + Constants.UNDERLINE + a.getSchemaElement().getId()
+ Constants.UNDERLINE + a.getSchemaElement().getName(); + Constants.UNDERLINE + a.getSchemaElement().getName();
} }
public void detectByStep(QueryContext queryContext, Set<FuzzyResult> existResults, Set<Long> detectModelIds, public void detectByStep(QueryContext queryContext, Set<DatabaseMapResult> existResults, Set<Long> detectModelIds,
Integer startIndex, Integer index, int offset) { Integer startIndex, Integer index, int offset) {
String detectSegment = queryContext.getRequest().getQueryText().substring(startIndex, index); String detectSegment = queryContext.getRequest().getQueryText().substring(startIndex, index);
if (StringUtils.isBlank(detectSegment)) { if (StringUtils.isBlank(detectSegment)) {
@@ -80,11 +80,11 @@ public class FuzzyNameMatchStrategy extends BaseMatchStrategy<FuzzyResult> {
.collect(Collectors.toSet()); .collect(Collectors.toSet());
} }
for (SchemaElement schemaElement : schemaElements) { for (SchemaElement schemaElement : schemaElements) {
FuzzyResult fuzzyResult = new FuzzyResult(); DatabaseMapResult databaseMapResult = new DatabaseMapResult();
fuzzyResult.setDetectWord(detectSegment); databaseMapResult.setDetectWord(detectSegment);
fuzzyResult.setName(schemaElement.getName()); databaseMapResult.setName(schemaElement.getName());
fuzzyResult.setSchemaElement(schemaElement); databaseMapResult.setSchemaElement(schemaElement);
existResults.add(fuzzyResult); existResults.add(databaseMapResult);
} }
} }
} }

View File

@@ -15,7 +15,7 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
/*** /***
* A mapper that is capable of semantic understanding of text. * A mapper that recognize elements through embedding.
*/ */
@Slf4j @Slf4j
public class EmbeddingMapper extends BaseMapper { public class EmbeddingMapper extends BaseMapper {
@@ -23,7 +23,6 @@ public class EmbeddingMapper extends BaseMapper {
@Override @Override
public void doMap(QueryContext queryContext) { public void doMap(QueryContext queryContext) {
//1. query from embedding by queryText //1. query from embedding by queryText
String queryText = queryContext.getRequest().getQueryText(); String queryText = queryContext.getRequest().getQueryText();
List<Term> terms = HanlpHelper.getTerms(queryText); List<Term> terms = HanlpHelper.getTerms(queryText);
@@ -39,11 +38,11 @@ public class EmbeddingMapper extends BaseMapper {
SchemaElement schemaElement = JSONObject.parseObject(JSONObject.toJSONString(matchResult.getMetadata()), SchemaElement schemaElement = JSONObject.parseObject(JSONObject.toJSONString(matchResult.getMetadata()),
SchemaElement.class); SchemaElement.class);
if (StringUtils.isBlank(matchResult.getMetadata().get("modelId"))) { String modelIdStr = matchResult.getMetadata().get("modelId");
if (StringUtils.isBlank(modelIdStr)) {
continue; continue;
} }
long modelId = Long.parseLong(matchResult.getMetadata().get("modelId")); long modelId = Long.parseLong(modelIdStr);
schemaElement = getSchemaElement(modelId, schemaElement.getType(), elementId); schemaElement = getSchemaElement(modelId, schemaElement.getType(), elementId);
if (schemaElement == null) { if (schemaElement == null) {
continue; continue;

View File

@@ -1,67 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.FuzzyResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
/***
* A mapper capable of fuzzy parsing of metric names and dimension names.
*/
@Slf4j
public class FuzzyNameMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
List<Term> terms = HanlpHelper.getTerms(queryContext.getRequest().getQueryText());
FuzzyNameMatchStrategy fuzzyNameMatchStrategy = ContextUtils.getBean(FuzzyNameMatchStrategy.class);
MapperHelper mapperHelper = ContextUtils.getBean(MapperHelper.class);
List<FuzzyResult> matches = fuzzyNameMatchStrategy.getMatches(queryContext, terms);
for (FuzzyResult match : matches) {
SchemaElement schemaElement = match.getSchemaElement();
Set<Long> regElementSet = getRegElementSet(queryContext.getMapInfo(), schemaElement);
if (regElementSet.contains(schemaElement.getId())) {
continue;
}
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(schemaElement)
.word(schemaElement.getName())
.detectWord(match.getDetectWord())
.frequency(10000L)
.similarity(mapperHelper.getSimilarity(match.getDetectWord(), schemaElement.getName()))
.build();
log.info("add to schema, elementMatch {}", schemaElementMatch);
addToSchemaMap(queryContext.getMapInfo(), schemaElement.getModel(), schemaElementMatch);
}
}
private Set<Long> getRegElementSet(SchemaMapInfo schemaMap, SchemaElement schemaElement) {
List<SchemaElementMatch> elements = schemaMap.getMatchedElements(schemaElement.getModel());
if (CollectionUtils.isEmpty(elements)) {
return new HashSet<>();
}
return elements.stream()
.filter(elementMatch ->
SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
|| SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType()))
.map(elementMatch -> elementMatch.getElement().getId())
.collect(Collectors.toSet());
}
}

View File

@@ -1,84 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.HanlpMapResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import com.tencent.supersonic.knowledge.utils.NatureHelper;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
/***
* A mapper capable of prefix and suffix similarity parsing for
* domain names, dimension values, metric names, and dimension names.
*/
@Slf4j
public class HanlpDictMapper extends BaseMapper {
@Override
public void doMap(QueryContext queryContext) {
String queryText = queryContext.getRequest().getQueryText();
List<Term> terms = HanlpHelper.getTerms(queryText);
HanlpDictMatchStrategy matchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);
List<HanlpMapResult> matches = matchStrategy.getMatches(queryContext, terms);
HanlpHelper.transLetterOriginal(matches);
convertTermsToSchemaMapInfo(matches, queryContext.getMapInfo(), terms);
}
private void convertTermsToSchemaMapInfo(List<HanlpMapResult> hanlpMapResults, SchemaMapInfo schemaMap,
List<Term> terms) {
if (CollectionUtils.isEmpty(hanlpMapResults)) {
return;
}
Map<String, Long> wordNatureToFrequency = terms.stream().collect(
Collectors.toMap(entry -> entry.getWord() + entry.getNature(),
term -> Long.valueOf(term.getFrequency()), (value1, value2) -> value2));
for (HanlpMapResult hanlpMapResult : hanlpMapResults) {
for (String nature : hanlpMapResult.getNatures()) {
Long modelId = NatureHelper.getModelId(nature);
if (Objects.isNull(modelId)) {
continue;
}
SchemaElementType elementType = NatureHelper.convertToElementType(nature);
if (Objects.isNull(elementType)) {
continue;
}
Long elementID = NatureHelper.getElementID(nature);
SchemaElement element = getSchemaElement(modelId, elementType, elementID);
if (element == null) {
continue;
}
if (element.getType().equals(SchemaElementType.VALUE)) {
element.setName(hanlpMapResult.getName());
}
Long frequency = wordNatureToFrequency.get(hanlpMapResult.getName() + nature);
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
.element(element)
.frequency(frequency)
.word(hanlpMapResult.getName())
.similarity(hanlpMapResult.getSimilarity())
.detectWord(hanlpMapResult.getDetectWord())
.build();
addToSchemaMap(schemaMap, modelId, schemaElementMatch);
}
}
}
}

View File

@@ -0,0 +1,122 @@
package com.tencent.supersonic.chat.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.knowledge.dictionary.DatabaseMapResult;
import com.tencent.supersonic.knowledge.dictionary.HanlpMapResult;
import com.tencent.supersonic.knowledge.utils.HanlpHelper;
import com.tencent.supersonic.knowledge.utils.NatureHelper;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
/**
 * A mapper that recognizes schema elements through keywords.
 *
 * <p>It runs two recognition strategies in order: {@link HanlpDictMatchStrategy} first, then
 * {@link DatabaseMatchStrategy}. Database matches are only added when the element id is not
 * already registered as a metric or dimension for the same model.
 */
@Slf4j
public class KeywordMapper extends BaseMapper {

    /**
     * Segments the query text once and feeds the terms to both strategies, writing every
     * resulting match into the map info of {@code queryContext}.
     *
     * @param queryContext context carrying the request text and the map info to be filled
     */
    @Override
    public void doMap(QueryContext queryContext) {
        String queryText = queryContext.getRequest().getQueryText();

        //1.hanlpDict Match
        List<Term> terms = HanlpHelper.getTerms(queryText);
        HanlpDictMatchStrategy hanlpMatchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);
        List<HanlpMapResult> hanlpMapResults = hanlpMatchStrategy.getMatches(queryContext, terms);
        convertHanlpMapResultToMapInfo(hanlpMapResults, queryContext.getMapInfo(), terms);

        //2.database Match
        DatabaseMatchStrategy databaseMatchStrategy = ContextUtils.getBean(DatabaseMatchStrategy.class);
        List<DatabaseMapResult> databaseResults = databaseMatchStrategy.getMatches(queryContext, terms);
        convertDatabaseMapResultToMapInfo(queryContext, databaseResults);
    }

    /**
     * Converts every (match, nature) pair of the dictionary results into a
     * {@link SchemaElementMatch} and registers it under the model encoded in the nature.
     *
     * @param mapResults dictionary matches; nothing happens when null or empty
     * @param schemaMap  destination for the produced element matches
     * @param terms      segmented terms used to look up word frequencies
     */
    private void convertHanlpMapResultToMapInfo(List<HanlpMapResult> mapResults, SchemaMapInfo schemaMap,
            List<Term> terms) {
        if (CollectionUtils.isEmpty(mapResults)) {
            return;
        }

        // NOTE(review): presumably restores the original letter form of matched words - confirm in HanlpHelper.
        HanlpHelper.transLetterOriginal(mapResults);

        // Keyed by word + nature; when the same key occurs twice the later term wins.
        Map<String, Long> wordNatureToFrequency = terms.stream().collect(
                Collectors.toMap(entry -> entry.getWord() + entry.getNature(),
                        term -> Long.valueOf(term.getFrequency()), (value1, value2) -> value2));

        for (HanlpMapResult hanlpMapResult : mapResults) {
            for (String nature : hanlpMapResult.getNatures()) {
                Long modelId = NatureHelper.getModelId(nature);
                if (Objects.isNull(modelId)) {
                    continue;
                }
                SchemaElementType elementType = NatureHelper.convertToElementType(nature);
                if (Objects.isNull(elementType)) {
                    continue;
                }
                Long elementID = NatureHelper.getElementID(nature);
                SchemaElement element = getSchemaElement(modelId, elementType, elementID);
                if (element == null) {
                    continue;
                }
                // For dimension values the element name is replaced by the matched dictionary word.
                if (element.getType().equals(SchemaElementType.VALUE)) {
                    element.setName(hanlpMapResult.getName());
                }
                // May be null when the matched word/nature pair is not among the segmented terms.
                Long frequency = wordNatureToFrequency.get(hanlpMapResult.getName() + nature);
                SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                        .element(element)
                        .frequency(frequency)
                        .word(hanlpMapResult.getName())
                        .similarity(hanlpMapResult.getSimilarity())
                        .detectWord(hanlpMapResult.getDetectWord())
                        .build();
                addToSchemaMap(schemaMap, modelId, schemaElementMatch);
            }
        }
    }

    /**
     * Registers database matches whose element is not yet present as a metric or dimension
     * in the map info of the element's model.
     *
     * @param queryContext context whose map info receives the matches
     * @param mapResults   database matches; nothing happens when null or empty
     */
    private void convertDatabaseMapResultToMapInfo(QueryContext queryContext, List<DatabaseMapResult> mapResults) {
        // Same guard as the hanlp conversion: avoids an NPE on a null list and a needless bean lookup.
        if (CollectionUtils.isEmpty(mapResults)) {
            return;
        }
        MapperHelper mapperHelper = ContextUtils.getBean(MapperHelper.class);
        for (DatabaseMapResult match : mapResults) {
            SchemaElement schemaElement = match.getSchemaElement();
            // Recomputed per match because earlier iterations may have added elements to the map info.
            Set<Long> regElementSet = getRegElementSet(queryContext.getMapInfo(), schemaElement);
            if (regElementSet.contains(schemaElement.getId())) {
                continue;
            }
            SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                    .element(schemaElement)
                    .word(schemaElement.getName())
                    .detectWord(match.getDetectWord())
                    // Database matches carry no dictionary frequency, so a fixed value is used.
                    .frequency(10000L)
                    .similarity(mapperHelper.getSimilarity(match.getDetectWord(), schemaElement.getName()))
                    .build();
            log.info("add to schema, elementMatch {}", schemaElementMatch);
            addToSchemaMap(queryContext.getMapInfo(), schemaElement.getModel(), schemaElementMatch);
        }
    }

    /**
     * Collects the ids of all METRIC and DIMENSION elements already matched for the model that
     * {@code schemaElement} belongs to.
     *
     * @param schemaMap     map info holding the matches collected so far
     * @param schemaElement element whose model is inspected
     * @return ids of the registered metric/dimension elements; empty when the model has no matches
     */
    private Set<Long> getRegElementSet(SchemaMapInfo schemaMap, SchemaElement schemaElement) {
        List<SchemaElementMatch> elements = schemaMap.getMatchedElements(schemaElement.getModel());
        if (CollectionUtils.isEmpty(elements)) {
            return new HashSet<>();
        }
        return elements.stream()
                .filter(elementMatch ->
                        SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
                                || SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType()))
                .map(elementMatch -> elementMatch.getElement().getId())
                .collect(Collectors.toSet());
    }
}

View File

@@ -1,22 +0,0 @@
package com.tencent.supersonic.chat.mapper;
import com.tencent.supersonic.chat.api.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.request.QueryReq;
import com.tencent.supersonic.chat.test.context.ContextTest;
import org.junit.jupiter.api.Test;
/**
 * Smoke test for {@link HanlpDictMapper}: runs the mapper against a sample query
 * and only verifies that no exception is thrown.
 */
class HanlpDictMapperTest extends ContextTest {

    @Test
    void map() {
        QueryReq request = new QueryReq();
        request.setQueryText("supersonic按部门访问次数");
        request.setModelId(2L);
        request.setChatId(1);

        HanlpDictMapper mapper = new HanlpDictMapper();
        QueryContext context = new QueryContext(request);
        mapper.map(context);
    }
}

View File

@@ -7,7 +7,7 @@ import lombok.ToString;
@Data @Data
@ToString @ToString
public class FuzzyResult extends MapResult { public class DatabaseMapResult extends MapResult {
private SchemaElement schemaElement; private SchemaElement schemaElement;
@@ -19,7 +19,7 @@ public class FuzzyResult extends MapResult {
if (o == null || getClass() != o.getClass()) { if (o == null || getClass() != o.getClass()) {
return false; return false;
} }
FuzzyResult that = (FuzzyResult) o; DatabaseMapResult that = (DatabaseMapResult) o;
return Objects.equal(name, that.name) && Objects.equal(schemaElement, that.schemaElement); return Objects.equal(name, that.name) && Objects.equal(schemaElement, that.schemaElement);
} }

View File

@@ -1,7 +1,6 @@
com.tencent.supersonic.chat.api.component.SchemaMapper=\ com.tencent.supersonic.chat.api.component.SchemaMapper=\
com.tencent.supersonic.chat.mapper.EmbeddingMapper, \ com.tencent.supersonic.chat.mapper.EmbeddingMapper, \
com.tencent.supersonic.chat.mapper.HanlpDictMapper, \ com.tencent.supersonic.chat.mapper.KeywordMapper, \
com.tencent.supersonic.chat.mapper.FuzzyNameMapper, \
com.tencent.supersonic.chat.mapper.QueryFilterMapper, \ com.tencent.supersonic.chat.mapper.QueryFilterMapper, \
com.tencent.supersonic.chat.mapper.EntityMapper com.tencent.supersonic.chat.mapper.EntityMapper

View File

@@ -1,7 +1,6 @@
com.tencent.supersonic.chat.api.component.SchemaMapper=\ com.tencent.supersonic.chat.api.component.SchemaMapper=\
com.tencent.supersonic.chat.mapper.EmbeddingMapper, \ com.tencent.supersonic.chat.mapper.EmbeddingMapper, \
com.tencent.supersonic.chat.mapper.HanlpDictMapper, \ com.tencent.supersonic.chat.mapper.KeywordMapper, \
com.tencent.supersonic.chat.mapper.FuzzyNameMapper, \
com.tencent.supersonic.chat.mapper.QueryFilterMapper, \ com.tencent.supersonic.chat.mapper.QueryFilterMapper, \
com.tencent.supersonic.chat.mapper.EntityMapper, \ com.tencent.supersonic.chat.mapper.EntityMapper, \
com.tencent.supersonic.chat.mapper.ModelClusterMapper com.tencent.supersonic.chat.mapper.ModelClusterMapper

View File

@@ -1,7 +1,6 @@
com.tencent.supersonic.chat.api.component.SchemaMapper=\ com.tencent.supersonic.chat.api.component.SchemaMapper=\
com.tencent.supersonic.chat.mapper.EmbeddingMapper, \ com.tencent.supersonic.chat.mapper.EmbeddingMapper, \
com.tencent.supersonic.chat.mapper.HanlpDictMapper, \ com.tencent.supersonic.chat.mapper.KeywordMapper, \
com.tencent.supersonic.chat.mapper.FuzzyNameMapper, \
com.tencent.supersonic.chat.mapper.QueryFilterMapper, \ com.tencent.supersonic.chat.mapper.QueryFilterMapper, \
com.tencent.supersonic.chat.mapper.EntityMapper, \ com.tencent.supersonic.chat.mapper.EntityMapper, \
com.tencent.supersonic.chat.mapper.ModelClusterMapper com.tencent.supersonic.chat.mapper.ModelClusterMapper