(improvement)(headless) Convert the modelId in each term nature to viewId before passing terms to chat, and store the metadata modelId (instead of viewId) in dict words (#739)

Co-authored-by: jolunoluo
This commit is contained in:
LXW
2024-02-23 10:29:21 +08:00
committed by GitHub
parent 16643e8d75
commit e95a528219
52 changed files with 456 additions and 245 deletions

View File

@@ -6,8 +6,15 @@ import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.request.DimensionValueReq;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -17,11 +24,6 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;
import com.tencent.supersonic.headless.api.pojo.request.DimensionValueReq;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
@Slf4j
public class SearchService {
@@ -39,14 +41,14 @@ public class SearchService {
* @param key
* @return
*/
public static List<HanlpMapResult> prefixSearch(String key, int limit, Set<Long> detectModelIds) {
return prefixSearch(key, limit, trie, detectModelIds);
public static List<HanlpMapResult> prefixSearch(String key, int limit, Map<Long, List<Long>> modelIdToViewIds) {
return prefixSearch(key, limit, trie, modelIdToViewIds);
}
public static List<HanlpMapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie,
Set<Long> detectModelIds) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, detectModelIds);
return result.stream().map(
Map<Long, List<Long>> modelIdToViewIds) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, modelIdToViewIds.keySet());
List<HanlpMapResult> hanlpMapResults = result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
return new HanlpMapResult(name, entry.getValue(), key);
@@ -54,6 +56,13 @@ public class SearchService {
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
for (HanlpMapResult hanlpMapResult : hanlpMapResults) {
List<String> natures = hanlpMapResult.getNatures().stream()
.map(nature -> NatureHelper.changeModel2View(nature, modelIdToViewIds))
.flatMap(Collection::stream).collect(Collectors.toList());
hanlpMapResult.setNatures(natures);
}
return hanlpMapResults;
}
/***

View File

@@ -38,11 +38,11 @@ public class DimensionWordBuilder extends BaseWordBuilder {
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.DIMENSION.getType();
if (isSuffix) {
nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.SUFFIX.getType() + DictWordType.DIMENSION.getType();
}
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));

View File

@@ -27,8 +27,8 @@ public class EntityWordBuilder extends BaseWordBuilder {
return result;
}
Long view = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + view + DictWordType.NATURE_SPILT + schemaElement.getId()
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.ENTITY.getType();
if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {

View File

@@ -38,11 +38,11 @@ public class MetricWordBuilder extends BaseWordBuilder {
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.METRIC.getType();
if (isSuffix) {
nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.SUFFIX.getType() + DictWordType.METRIC.getType();
}
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));

View File

@@ -27,8 +27,8 @@ public class ValueWordBuilder extends BaseWordBuilder {
schemaElement.getAlias().stream().forEach(value -> {
DictWord dictWord = new DictWord();
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId();
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId();
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
dictWord.setWord(value);
result.add(dictWord);

View File

@@ -1,34 +1,36 @@
package com.tencent.supersonic.headless.core.knowledge.helper;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import com.google.common.collect.Lists;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.headless.core.knowledge.HadoopFileIOAdapter;
import com.tencent.supersonic.headless.core.knowledge.MapResult;
import com.tencent.supersonic.headless.core.knowledge.MultiCustomDictionary;
import com.tencent.supersonic.headless.core.knowledge.SearchService;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.hankcs.hanlp.seg.common.Term;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
/**
* HanLP helper
@@ -212,18 +214,25 @@ public class HanlpHelper {
}
}
public static List<com.tencent.supersonic.headless.api.pojo.response.S2Term> getTerms(String text) {
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToViewIds) {
return getSegment().seg(text.toLowerCase()).stream()
.filter(term -> term.getNature().startsWith(DictWordType.NATURE_SPILT))
.map(term -> transform2ApiTerm(term))
.map(term -> transform2ApiTerm(term, modelIdToViewIds))
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
public static S2Term transform2ApiTerm(Term term) {
S2Term knowledgeTerm = new S2Term();
BeanUtils.copyProperties(term, knowledgeTerm);
knowledgeTerm.setFrequency(term.getFrequency());
return knowledgeTerm;
public static List<S2Term> transform2ApiTerm(Term term, Map<Long, List<Long>> modelIdToViewIds) {
List<S2Term> s2Terms = Lists.newArrayList();
List<String> natures = NatureHelper.changeModel2View(String.valueOf(term.getNature()), modelIdToViewIds);
for (String nature : natures) {
S2Term s2Term = new S2Term();
BeanUtils.copyProperties(term, s2Term);
s2Term.setNature(Nature.create(nature));
s2Term.setFrequency(term.getFrequency());
s2Terms.add(s2Term);
}
return s2Terms;
}
}

View File

@@ -1,12 +1,14 @@
package com.tencent.supersonic.headless.core.knowledge.helper;
import com.google.common.collect.Lists;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.ViewInfoStat;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.Comparator;
@@ -81,6 +83,43 @@ public class NatureHelper {
return null;
}
/**
 * Extracts the model id from a nature string.
 *
 * <p>The nature is split on {@link DictWordType#NATURE_SPILT} and the segment at
 * index 1 is parsed as a {@code Long}.
 *
 * @param nature nature string to inspect; may be null or empty
 * @return the parsed model id, or {@code null} when the nature is null/empty,
 *         has fewer than two segments, or the segment is not a valid long
 */
private static Long getModelId(String nature) {
    // Guard first so a missing nature yields "no model id" instead of a NullPointerException.
    if (StringUtils.isEmpty(nature)) {
        return null;
    }
    String[] split = nature.split(DictWordType.NATURE_SPILT);
    if (split.length <= 1) {
        return null;
    }
    try {
        return Long.valueOf(split[1]);
    } catch (NumberFormatException e) {
        // Include the offending nature so the log entry is actionable.
        log.error("failed to parse model id from nature: {}", nature, e);
        return null;
    }
}
/**
 * Builds a nature in which the segment at index 1 (the position the model id is
 * read from) is replaced by the given view id.
 *
 * <p>The nature is split on {@link DictWordType#NATURE_SPILT}, the second segment is
 * overwritten with {@code String.valueOf(viewId)}, and the segments are joined back
 * with the same separator.
 *
 * @param nature original nature string; may be null or empty
 * @param viewId view id to write into the second segment
 * @return the rewritten nature, or {@code null} when the nature is null/empty or has
 *         fewer than two segments
 */
private static Nature changeModel2View(String nature, Long viewId) {
    if (StringUtils.isEmpty(nature)) {
        return null;
    }
    String[] split = nature.split(DictWordType.NATURE_SPILT);
    if (split.length <= 1) {
        return null;
    }
    // No number is parsed here (the id is only formatted), so the former
    // NumberFormatException handler could never run and has been removed.
    split[1] = String.valueOf(viewId);
    return Nature.create(StringUtils.join(split, DictWordType.NATURE_SPILT));
}
/**
 * Expands a model-based nature into the corresponding view-based natures.
 *
 * <p>The model id is read from the nature, looked up in {@code modelIdToViewIds}, and
 * one rewritten nature string is produced per mapped view id.
 *
 * @param nature           nature string carrying a model id in its second segment
 * @param modelIdToViewIds mapping from model id to the view ids to expand into
 * @return a list of rewritten nature strings; empty when the model id cannot be
 *         determined, the mapping is null, or no view ids are mapped for the model
 */
public static List<String> changeModel2View(String nature, Map<Long, List<Long>> modelIdToViewIds) {
    Long modelId = getModelId(nature);
    // Skip the lookup when there is nothing to look up: Map.get(null) throws on
    // null-hostile maps (e.g. TreeMap with natural ordering, immutable maps).
    if (modelId == null || modelIdToViewIds == null) {
        return Lists.newArrayList();
    }
    List<Long> viewIds = modelIdToViewIds.get(modelId);
    if (CollectionUtils.isEmpty(viewIds)) {
        return Lists.newArrayList();
    }
    return viewIds.stream()
            .map(viewId -> changeModel2View(nature, viewId))
            // Drop natures that could not be rewritten rather than emitting the text "null".
            .filter(viewNature -> viewNature != null)
            .map(viewNature -> String.valueOf(viewNature))
            .collect(Collectors.toList());
}
public static boolean isDimensionValueViewId(String nature) {
if (StringUtils.isEmpty(nature)) {
return false;