(improvement)(Headless) Put term into dict so that it can be mapped by mapper (#1002)

This commit is contained in:
LXW
2024-05-16 10:15:04 +08:00
committed by GitHub
parent 21af74c674
commit 55c625a915
34 changed files with 292 additions and 137 deletions

View File

@@ -0,0 +1,44 @@
package com.tencent.supersonic.headless.core.chat.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.chat.knowledge.DictWord;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Term DictWord
 *
 * <p>Produces the dictionary entries for a term schema element: one entry for the
 * word itself, the entries returned by {@code getOneWordNatureAlias}, and one
 * suffix entry for the reversed word when the reversal differs from the word.
 */
@Service
public class TermWordBuilder extends BaseWordWithAliasBuilder {

    /**
     * Collects every dictionary word generated for the given term.
     *
     * @param word          text to register in the dictionary
     * @param schemaElement element supplying the data set id, element id and aliases
     * @return the entry for {@code word}, followed by the alias entries, followed by a
     *         suffix entry for the reversed word unless the reversed word equals
     *         {@code word} ignoring case
     */
    @Override
    public List<DictWord> doGet(String word, SchemaElement schemaElement) {
        List<DictWord> dictWords = Lists.newArrayList();
        dictWords.add(getOneWordNature(word, schemaElement, false));
        dictWords.addAll(getOneWordNatureAlias(schemaElement, false));

        String reversed = StringUtils.reverse(word);
        // When the word reads the same in both directions (ignoring case),
        // no separate suffix entry is added.
        if (word.equalsIgnoreCase(reversed)) {
            return dictWords;
        }
        dictWords.add(getOneWordNature(reversed, schemaElement, true));
        return dictWords;
    }

    /**
     * Creates a single dictionary word whose nature encodes the data set id and
     * the element id, followed by the term type marker.
     *
     * @param word          text of the dictionary entry
     * @param schemaElement element supplying the data set id and element id
     * @param isSuffix      when {@code true}, the suffix type marker is inserted
     *                      in front of the term type marker
     * @return the dictionary word with its nature and {@code DEFAULT_FREQUENCY} set
     */
    public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
        Long dataSetId = schemaElement.getDataSet();
        // Shared head of the nature: <split><dataSetId><split><elementId>
        String naturePrefix = DictWordType.NATURE_SPILT + dataSetId
                + DictWordType.NATURE_SPILT + schemaElement.getId();
        String nature = isSuffix
                ? naturePrefix + DictWordType.SUFFIX.getType() + DictWordType.TERM.getType()
                : naturePrefix + DictWordType.TERM.getType();

        DictWord termWord = new DictWord();
        termWord.setWord(word);
        termWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
        return termWord;
    }
}

View File

@@ -18,6 +18,7 @@ public class WordBuilderFactory {
wordNatures.put(DictWordType.DATASET, new ModelWordBuilder());
wordNatures.put(DictWordType.ENTITY, new EntityWordBuilder());
wordNatures.put(DictWordType.VALUE, new ValueWordBuilder());
wordNatures.put(DictWordType.TERM, new TermWordBuilder());
}
public static BaseWordBuilder get(DictWordType strategyType) {

View File

@@ -214,17 +214,17 @@ public class HanlpHelper {
}
}
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToViewIds) {
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
return getSegment().seg(text.toLowerCase()).stream()
.filter(term -> term.getNature().startsWith(DictWordType.NATURE_SPILT))
.map(term -> transform2ApiTerm(term, modelIdToViewIds))
.map(term -> transform2ApiTerm(term, modelIdToDataSetIds))
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
public static List<S2Term> transform2ApiTerm(Term term, Map<Long, List<Long>> modelIdToViewIds) {
public static List<S2Term> transform2ApiTerm(Term term, Map<Long, List<Long>> modelIdToDataSetIds) {
List<S2Term> s2Terms = Lists.newArrayList();
List<String> natures = NatureHelper.changeModel2DataSet(String.valueOf(term.getNature()), modelIdToViewIds);
List<String> natures = NatureHelper.changeModel2DataSet(String.valueOf(term.getNature()), modelIdToDataSetIds);
for (String nature : natures) {
S2Term s2Term = new S2Term();
BeanUtils.copyProperties(term, s2Term);

View File

@@ -6,6 +6,10 @@ import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.chat.knowledge.DataSetInfoStat;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
@@ -14,9 +18,6 @@ import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
/**
* nature parse helper
@@ -46,6 +47,9 @@ public class NatureHelper {
case VALUE:
result = SchemaElementType.VALUE;
break;
case TERM:
result = SchemaElementType.TERM;
break;
default:
break;
}
@@ -108,6 +112,10 @@ public class NatureHelper {
}
public static List<String> changeModel2DataSet(String nature, Map<Long, List<Long>> modelIdToDataSetIds) {
//term prefix id is dataSetId, no need to transform
if (SchemaElementType.TERM.equals(NatureHelper.convertToElementType(nature))) {
return Lists.newArrayList(nature);
}
Long modelId = getModelId(nature);
List<Long> dataSetIds = modelIdToDataSetIds.get(modelId);
if (CollectionUtils.isEmpty(dataSetIds)) {
@@ -129,7 +137,7 @@ public class NatureHelper {
return false;
}
return !nature.endsWith(DictWordType.METRIC.getType()) && !nature.endsWith(
DictWordType.DIMENSION.getType())
DictWordType.DIMENSION.getType()) && !nature.endsWith(DictWordType.TERM.getType())
&& StringUtils.isNumeric(split[1]);
}

View File

@@ -111,6 +111,10 @@ public abstract class BaseMapper implements SchemaMapper {
if (!existElement.equals(newElement)) {
return false;
}
if (SchemaElementType.TERM.equals(existElement.getType())
&& SchemaElementType.TERM.equals(newElement.getType())) {
return false;
}
if (SchemaElementType.VALUE.equals(newElement.getType())) {
return existElementMatch.getWord().equalsIgnoreCase(newElementMatch.getWord());
}

View File

@@ -46,7 +46,7 @@ public class HanlpDictMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
return null;
}
log.debug("retryCount:{},terms:{},,detectModelIds:{}", terms, detectDataSetIds);
log.debug("terms:{},detectModelIds:{}", terms, detectDataSetIds);
List<HanlpMapResult> detects = detect(queryContext, terms, detectDataSetIds);
Map<MatchText, List<HanlpMapResult>> result = new HashMap<>();