mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-14 13:47:09 +00:00
(improvement)(Headless) Put term into dict and let it can be mapped by mapper (#1002)
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
package com.tencent.supersonic.headless.core.chat.knowledge.builder;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
|
||||
import com.tencent.supersonic.headless.core.chat.knowledge.DictWord;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Metric DictWord
|
||||
*/
|
||||
@Service
|
||||
public class TermWordBuilder extends BaseWordWithAliasBuilder {
|
||||
|
||||
@Override
|
||||
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
|
||||
List<DictWord> result = Lists.newArrayList();
|
||||
result.add(getOneWordNature(word, schemaElement, false));
|
||||
result.addAll(getOneWordNatureAlias(schemaElement, false));
|
||||
String reverseWord = StringUtils.reverse(word);
|
||||
if (!word.equalsIgnoreCase(reverseWord)) {
|
||||
result.add(getOneWordNature(reverseWord, schemaElement, true));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
|
||||
DictWord dictWord = new DictWord();
|
||||
dictWord.setWord(word);
|
||||
Long dataSet = schemaElement.getDataSet();
|
||||
String nature = DictWordType.NATURE_SPILT + dataSet + DictWordType.NATURE_SPILT + schemaElement.getId()
|
||||
+ DictWordType.TERM.getType();
|
||||
if (isSuffix) {
|
||||
nature = DictWordType.NATURE_SPILT + dataSet + DictWordType.NATURE_SPILT + schemaElement.getId()
|
||||
+ DictWordType.SUFFIX.getType() + DictWordType.TERM.getType();
|
||||
}
|
||||
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
|
||||
return dictWord;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -18,6 +18,7 @@ public class WordBuilderFactory {
|
||||
wordNatures.put(DictWordType.DATASET, new ModelWordBuilder());
|
||||
wordNatures.put(DictWordType.ENTITY, new EntityWordBuilder());
|
||||
wordNatures.put(DictWordType.VALUE, new ValueWordBuilder());
|
||||
wordNatures.put(DictWordType.TERM, new TermWordBuilder());
|
||||
}
|
||||
|
||||
public static BaseWordBuilder get(DictWordType strategyType) {
|
||||
|
||||
@@ -214,17 +214,17 @@ public class HanlpHelper {
|
||||
}
|
||||
}
|
||||
|
||||
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToViewIds) {
|
||||
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||
return getSegment().seg(text.toLowerCase()).stream()
|
||||
.filter(term -> term.getNature().startsWith(DictWordType.NATURE_SPILT))
|
||||
.map(term -> transform2ApiTerm(term, modelIdToViewIds))
|
||||
.map(term -> transform2ApiTerm(term, modelIdToDataSetIds))
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static List<S2Term> transform2ApiTerm(Term term, Map<Long, List<Long>> modelIdToViewIds) {
|
||||
public static List<S2Term> transform2ApiTerm(Term term, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||
List<S2Term> s2Terms = Lists.newArrayList();
|
||||
List<String> natures = NatureHelper.changeModel2DataSet(String.valueOf(term.getNature()), modelIdToViewIds);
|
||||
List<String> natures = NatureHelper.changeModel2DataSet(String.valueOf(term.getNature()), modelIdToDataSetIds);
|
||||
for (String nature : natures) {
|
||||
S2Term s2Term = new S2Term();
|
||||
BeanUtils.copyProperties(term, s2Term);
|
||||
|
||||
@@ -6,6 +6,10 @@ import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.core.chat.knowledge.DataSetInfoStat;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
@@ -14,9 +18,6 @@ import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
/**
|
||||
* nature parse helper
|
||||
@@ -46,6 +47,9 @@ public class NatureHelper {
|
||||
case VALUE:
|
||||
result = SchemaElementType.VALUE;
|
||||
break;
|
||||
case TERM:
|
||||
result = SchemaElementType.TERM;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -108,6 +112,10 @@ public class NatureHelper {
|
||||
}
|
||||
|
||||
public static List<String> changeModel2DataSet(String nature, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||
//term prefix id is dataSetId, no need to transform
|
||||
if (SchemaElementType.TERM.equals(NatureHelper.convertToElementType(nature))) {
|
||||
return Lists.newArrayList(nature);
|
||||
}
|
||||
Long modelId = getModelId(nature);
|
||||
List<Long> dataSetIds = modelIdToDataSetIds.get(modelId);
|
||||
if (CollectionUtils.isEmpty(dataSetIds)) {
|
||||
@@ -129,7 +137,7 @@ public class NatureHelper {
|
||||
return false;
|
||||
}
|
||||
return !nature.endsWith(DictWordType.METRIC.getType()) && !nature.endsWith(
|
||||
DictWordType.DIMENSION.getType())
|
||||
DictWordType.DIMENSION.getType()) && !nature.endsWith(DictWordType.TERM.getType())
|
||||
&& StringUtils.isNumeric(split[1]);
|
||||
}
|
||||
|
||||
|
||||
@@ -111,6 +111,10 @@ public abstract class BaseMapper implements SchemaMapper {
|
||||
if (!existElement.equals(newElement)) {
|
||||
return false;
|
||||
}
|
||||
if (SchemaElementType.TERM.equals(existElement.getType())
|
||||
&& SchemaElementType.TERM.equals(newElement.getType())) {
|
||||
return false;
|
||||
}
|
||||
if (SchemaElementType.VALUE.equals(newElement.getType())) {
|
||||
return existElementMatch.getWord().equalsIgnoreCase(newElementMatch.getWord());
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ public class HanlpDictMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
|
||||
return null;
|
||||
}
|
||||
|
||||
log.debug("retryCount:{},terms:{},,detectModelIds:{}", terms, detectDataSetIds);
|
||||
log.debug("terms:{},detectModelIds:{}", terms, detectDataSetIds);
|
||||
|
||||
List<HanlpMapResult> detects = detect(queryContext, terms, detectDataSetIds);
|
||||
Map<MatchText, List<HanlpMapResult>> result = new HashMap<>();
|
||||
|
||||
Reference in New Issue
Block a user