mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-10 02:46:56 +00:00
(improvement)(chat) Optimize and fix the NatureHelper code. (#1510)
Co-authored-by: lexluo <lexluo@tencent.com>
This commit is contained in:
@@ -1,16 +1,11 @@
|
||||
package com.tencent.supersonic.headless.chat.knowledge.helper;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.DataSetInfoStat;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@@ -18,6 +13,9 @@ import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
/**
|
||||
* nature parse helper
|
||||
@@ -57,54 +55,35 @@ public class NatureHelper {
|
||||
}
|
||||
|
||||
private static boolean isDataSetOrEntity(S2Term term, Integer model) {
|
||||
return (DictWordType.NATURE_SPILT + model).equals(term.nature.toString()) || term.nature.toString()
|
||||
.endsWith(DictWordType.ENTITY.getType());
|
||||
String natureStr = term.nature.toString();
|
||||
return (DictWordType.NATURE_SPILT + model).equals(natureStr) || natureStr.endsWith(
|
||||
DictWordType.ENTITY.getType());
|
||||
}
|
||||
|
||||
public static Integer getDataSetByNature(Nature nature) {
|
||||
if (nature.startsWith(DictWordType.NATURE_SPILT)) {
|
||||
String[] dimensionValues = nature.toString().split(DictWordType.NATURE_SPILT);
|
||||
if (StringUtils.isNumeric(dimensionValues[1])) {
|
||||
return Integer.valueOf(dimensionValues[1]);
|
||||
}
|
||||
if (!nature.startsWith(DictWordType.NATURE_SPILT)) {
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
String[] dimensionValues = nature.toString().split(DictWordType.NATURE_SPILT);
|
||||
return StringUtils.isNumeric(dimensionValues[1]) ? Integer.valueOf(dimensionValues[1]) : 0;
|
||||
}
|
||||
|
||||
public static Long getDataSetId(String nature) {
|
||||
try {
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length <= 1) {
|
||||
return null;
|
||||
}
|
||||
return Long.valueOf(split[1]);
|
||||
} catch (NumberFormatException e) {
|
||||
log.error("", e);
|
||||
}
|
||||
return null;
|
||||
return parseIdFromNature(nature, 1);
|
||||
}
|
||||
|
||||
private static Long getModelId(String nature) {
|
||||
try {
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length <= 1) {
|
||||
return null;
|
||||
}
|
||||
return Long.valueOf(split[1]);
|
||||
} catch (NumberFormatException e) {
|
||||
log.error("", e);
|
||||
}
|
||||
return null;
|
||||
return parseIdFromNature(nature, 1);
|
||||
}
|
||||
|
||||
private static Nature changeModel2DataSet(String nature, Long dataSetId) {
|
||||
private static String changeModel2DataSet(String nature, Long dataSetId) {
|
||||
try {
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length <= 1) {
|
||||
return null;
|
||||
}
|
||||
split[1] = String.valueOf(dataSetId);
|
||||
return Nature.create(StringUtils.join(split, DictWordType.NATURE_SPILT));
|
||||
return String.join(DictWordType.NATURE_SPILT, split);
|
||||
} catch (NumberFormatException e) {
|
||||
log.error("", e);
|
||||
}
|
||||
@@ -112,47 +91,28 @@ public class NatureHelper {
|
||||
}
|
||||
|
||||
public static List<String> changeModel2DataSet(String nature, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||
//term prefix id is dataSetId, no need to transform
|
||||
if (SchemaElementType.TERM.equals(NatureHelper.convertToElementType(nature))) {
|
||||
return Lists.newArrayList(nature);
|
||||
return Collections.singletonList(nature);
|
||||
}
|
||||
Long modelId = getModelId(nature);
|
||||
List<Long> dataSetIds = modelIdToDataSetIds.get(modelId);
|
||||
if (CollectionUtils.isEmpty(dataSetIds)) {
|
||||
return Lists.newArrayList();
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return dataSetIds.stream().map(dataSetId -> String.valueOf(changeModel2DataSet(nature, dataSetId)))
|
||||
return dataSetIds.stream()
|
||||
.map(dataSetId -> changeModel2DataSet(nature, dataSetId))
|
||||
.filter(Objects::nonNull)
|
||||
.map(String::valueOf)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static boolean isDimensionValueDataSetId(String nature) {
|
||||
if (StringUtils.isEmpty(nature)) {
|
||||
return false;
|
||||
}
|
||||
if (!nature.startsWith(DictWordType.NATURE_SPILT)) {
|
||||
return false;
|
||||
}
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length <= 1) {
|
||||
return false;
|
||||
}
|
||||
return !nature.endsWith(DictWordType.METRIC.getType()) && !nature.endsWith(
|
||||
DictWordType.DIMENSION.getType()) && !nature.endsWith(DictWordType.TERM.getType())
|
||||
&& StringUtils.isNumeric(split[1]);
|
||||
return isNatureValid(nature) && !isNatureType(nature, DictWordType.METRIC, DictWordType.DIMENSION,
|
||||
DictWordType.TERM) && StringUtils.isNumeric(nature.split(DictWordType.NATURE_SPILT)[1]);
|
||||
}
|
||||
|
||||
public static boolean isTermNature(String nature) {
|
||||
if (StringUtils.isEmpty(nature)) {
|
||||
return false;
|
||||
}
|
||||
if (!nature.startsWith(DictWordType.NATURE_SPILT)) {
|
||||
return false;
|
||||
}
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length <= 1) {
|
||||
return false;
|
||||
}
|
||||
return nature.endsWith(DictWordType.TERM.getType());
|
||||
return isNatureValid(nature) && nature.endsWith(DictWordType.TERM.getType());
|
||||
}
|
||||
|
||||
public static DataSetInfoStat getDataSetStat(List<S2Term> terms) {
|
||||
@@ -182,72 +142,67 @@ public class NatureHelper {
|
||||
.endsWith(DictWordType.METRIC.getType())).count();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of types of class parts of speech
|
||||
* modelId -> (nature , natureCount)
|
||||
*
|
||||
* @param terms
|
||||
* @return
|
||||
*/
|
||||
public static Map<Long, Map<DictWordType, Integer>> getDataSetToNatureStat(List<S2Term> terms) {
|
||||
Map<Long, Map<DictWordType, Integer>> modelToNature = new HashMap<>();
|
||||
terms.stream().filter(
|
||||
term -> term.nature.startsWith(DictWordType.NATURE_SPILT)
|
||||
).forEach(term -> {
|
||||
DictWordType dictWordType = DictWordType.getNatureType(String.valueOf(term.nature));
|
||||
Long model = getDataSetId(String.valueOf(term.nature));
|
||||
terms.stream()
|
||||
.filter(term -> term.nature.startsWith(DictWordType.NATURE_SPILT))
|
||||
.forEach(term -> {
|
||||
DictWordType dictWordType = DictWordType.getNatureType(term.nature.toString());
|
||||
Long model = getDataSetId(term.nature.toString());
|
||||
|
||||
Map<DictWordType, Integer> natureTypeMap = new HashMap<>();
|
||||
natureTypeMap.put(dictWordType, 1);
|
||||
|
||||
Map<DictWordType, Integer> original = modelToNature.get(model);
|
||||
if (Objects.isNull(original)) {
|
||||
modelToNature.put(model, natureTypeMap);
|
||||
} else {
|
||||
Integer count = original.get(dictWordType);
|
||||
if (Objects.isNull(count)) {
|
||||
count = 1;
|
||||
} else {
|
||||
count = count + 1;
|
||||
}
|
||||
original.put(dictWordType, count);
|
||||
}
|
||||
});
|
||||
modelToNature.computeIfAbsent(model, k -> new HashMap<>())
|
||||
.merge(dictWordType, 1, Integer::sum);
|
||||
});
|
||||
return modelToNature;
|
||||
}
|
||||
|
||||
public static List<Long> selectPossibleDataSets(List<S2Term> terms) {
|
||||
Map<Long, Map<DictWordType, Integer>> modelToNatureStat = getDataSetToNatureStat(terms);
|
||||
Integer maxDataSetTypeSize = modelToNatureStat.entrySet().stream()
|
||||
.max(Comparator.comparingInt(o -> o.getValue().size())).map(entry -> entry.getValue().size())
|
||||
.orElse(null);
|
||||
if (Objects.isNull(maxDataSetTypeSize) || maxDataSetTypeSize == 0) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return modelToNatureStat.entrySet().stream().filter(entry -> entry.getValue().size() == maxDataSetTypeSize)
|
||||
.map(entry -> entry.getKey()).collect(Collectors.toList());
|
||||
return modelToNatureStat.entrySet().stream()
|
||||
.max(Comparator.comparingInt(entry -> entry.getValue().size()))
|
||||
.map(entry -> modelToNatureStat.entrySet().stream()
|
||||
.filter(e -> e.getValue().size() == entry.getValue().size())
|
||||
.map(Map.Entry::getKey)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(Collections.emptyList());
|
||||
}
|
||||
|
||||
public static Long getElementID(String nature) {
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length >= 3) {
|
||||
return Long.valueOf(split[2]);
|
||||
}
|
||||
return 0L;
|
||||
return parseIdFromNature(nature, 2);
|
||||
}
|
||||
|
||||
public static Set<Long> getModelIds(Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
||||
Set<Long> detectModelIds = modelIdToDataSetIds.keySet();
|
||||
if (!CollectionUtils.isEmpty(detectDataSetIds)) {
|
||||
detectModelIds = modelIdToDataSetIds.entrySet().stream().filter(entry -> {
|
||||
List<Long> dataSetIds = entry.getValue().stream().filter(detectDataSetIds::contains)
|
||||
.collect(Collectors.toList());
|
||||
if (!CollectionUtils.isEmpty(dataSetIds)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}).map(entry -> entry.getKey()).collect(Collectors.toSet());
|
||||
if (CollectionUtils.isEmpty(detectDataSetIds)) {
|
||||
return modelIdToDataSetIds.keySet();
|
||||
}
|
||||
return detectModelIds;
|
||||
return modelIdToDataSetIds.entrySet().stream()
|
||||
.filter(entry -> !Collections.disjoint(entry.getValue(), detectDataSetIds))
|
||||
.map(Map.Entry::getKey)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
public static Long parseIdFromNature(String nature, int index) {
|
||||
try {
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length > index) {
|
||||
return Long.valueOf(split[index]);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
log.error("Error parsing long from nature: {}", nature, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static boolean isNatureValid(String nature) {
|
||||
return StringUtils.isNotEmpty(nature) && nature.startsWith(DictWordType.NATURE_SPILT);
|
||||
}
|
||||
|
||||
private static boolean isNatureType(String nature, DictWordType... types) {
|
||||
for (DictWordType type : types) {
|
||||
if (nature.endsWith(type.getType())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
package com.tencent.supersonic.headless;
|
||||
|
||||
import com.tencent.supersonic.auth.api.authentication.pojo.User;
|
||||
import com.tencent.supersonic.headless.api.pojo.request.ItemValueReq;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.ItemValueResp;
|
||||
import com.tencent.supersonic.headless.server.service.TagQueryService;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.MethodOrderer;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.TestMethodOrder;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class TagTest extends BaseTest {
|
||||
|
||||
@Autowired
|
||||
private TagQueryService tagQueryService;
|
||||
|
||||
@Test
|
||||
public void testQueryTagValue() throws Exception {
|
||||
ItemValueReq itemValueReq = new ItemValueReq();
|
||||
itemValueReq.setId(1L);
|
||||
ItemValueResp itemValueResp = tagQueryService.queryTagValue(itemValueReq, User.getFakeUser());
|
||||
Assertions.assertNotNull(itemValueResp);
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user