mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-14 05:43:51 +00:00
(improvement)(Headless) Filtering based on dataSetIds during Mapper detection Compatible with term (#1096)
Co-authored-by: jolunoluo
This commit is contained in:
@@ -10,7 +10,6 @@ import org.springframework.util.CollectionUtils;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@@ -19,23 +18,11 @@ public class LoadRemoveService {
|
|||||||
@Value("${mapper.remove.nature.prefix:}")
|
@Value("${mapper.remove.nature.prefix:}")
|
||||||
private String mapperRemoveNaturePrefix;
|
private String mapperRemoveNaturePrefix;
|
||||||
|
|
||||||
public List removeNatures(List value, Set<Long> detectModelIds) {
|
public List removeNatures(List value) {
|
||||||
if (CollectionUtils.isEmpty(value)) {
|
if (CollectionUtils.isEmpty(value)) {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
List<String> resultList = new ArrayList<>(value);
|
List<String> resultList = new ArrayList<>(value);
|
||||||
if (!CollectionUtils.isEmpty(detectModelIds)) {
|
|
||||||
resultList.removeIf(nature -> {
|
|
||||||
if (Objects.isNull(nature)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
Long modelId = getDataSetId(nature);
|
|
||||||
if (Objects.nonNull(modelId)) {
|
|
||||||
return !detectModelIds.contains(modelId);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
|
if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
|
||||||
resultList.removeIf(nature -> {
|
resultList.removeIf(nature -> {
|
||||||
if (Objects.isNull(nature)) {
|
if (Objects.isNull(nature)) {
|
||||||
|
|||||||
@@ -2,6 +2,9 @@ package com.hankcs.hanlp.collection.trie.bintrie;
|
|||||||
|
|
||||||
import com.hankcs.hanlp.LoadRemoveService;
|
import com.hankcs.hanlp.LoadRemoveService;
|
||||||
import com.hankcs.hanlp.corpus.io.ByteArray;
|
import com.hankcs.hanlp.corpus.io.ByteArray;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.DataOutputStream;
|
import java.io.DataOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.ObjectInput;
|
import java.io.ObjectInput;
|
||||||
@@ -14,8 +17,6 @@ import java.util.Map;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
|
|
||||||
public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
||||||
@@ -286,12 +287,12 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
|||||||
+ '}';
|
+ '}';
|
||||||
}
|
}
|
||||||
|
|
||||||
public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelIds) {
|
public void walkNode(Set<Map.Entry<String, V>> entrySet) {
|
||||||
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
|
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
|
||||||
logger.debug("detectModelIds:{},before:{}", detectModelIds, value.toString());
|
logger.debug("walkNode before:{}", value.toString());
|
||||||
List natures = new LoadRemoveService().removeNatures((List) value, detectModelIds);
|
List natures = new LoadRemoveService().removeNatures((List) value);
|
||||||
String name = this.prefix != null ? this.prefix + c : "" + c;
|
String name = this.prefix != null ? this.prefix + c : "" + c;
|
||||||
logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
|
logger.debug("walkNode name:{},after:{},natures:{}", name, (List) value, natures);
|
||||||
entrySet.add(new TrieEntry(name, (V) natures));
|
entrySet.add(new TrieEntry(name, (V) natures));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -300,21 +301,17 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
|||||||
* walk limit
|
* walk limit
|
||||||
* @param sb
|
* @param sb
|
||||||
* @param entrySet
|
* @param entrySet
|
||||||
* @param limit
|
|
||||||
*/
|
*/
|
||||||
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Set<Long> detectModelIds) {
|
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
|
||||||
Queue<BaseNode> queue = new ArrayDeque<>();
|
Queue<BaseNode> queue = new ArrayDeque<>();
|
||||||
this.prefix = sb.toString();
|
this.prefix = sb.toString();
|
||||||
queue.add(this);
|
queue.add(this);
|
||||||
while (!queue.isEmpty()) {
|
while (!queue.isEmpty()) {
|
||||||
if (entrySet.size() >= limit) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
BaseNode root = queue.poll();
|
BaseNode root = queue.poll();
|
||||||
if (root == null) {
|
if (root == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
root.walkNode(entrySet, detectModelIds);
|
root.walkNode(entrySet);
|
||||||
if (root.child == null) {
|
if (root.child == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,22 +48,16 @@ public class SearchService {
|
|||||||
|
|
||||||
public static List<HanlpMapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie,
|
public static List<HanlpMapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie,
|
||||||
Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
||||||
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie,
|
Set<Map.Entry<String, List<String>>> result = search(key, binTrie);
|
||||||
modelIdToDataSetIds, detectDataSetIds);
|
|
||||||
List<HanlpMapResult> hanlpMapResults = result.stream().map(
|
List<HanlpMapResult> hanlpMapResults = result.stream().map(
|
||||||
entry -> {
|
entry -> {
|
||||||
String name = entry.getKey().replace("#", " ");
|
String name = entry.getKey().replace("#", " ");
|
||||||
return new HanlpMapResult(name, entry.getValue(), key);
|
return new HanlpMapResult(name, entry.getValue(), key);
|
||||||
}
|
}
|
||||||
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
|
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
|
||||||
.limit(SEARCH_SIZE)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (HanlpMapResult hanlpMapResult : hanlpMapResults) {
|
hanlpMapResults = transformAndFilterByDataSet(hanlpMapResults, modelIdToDataSetIds,
|
||||||
List<String> natures = hanlpMapResult.getNatures().stream()
|
detectDataSetIds, limit);
|
||||||
.map(nature -> NatureHelper.changeModel2DataSet(nature, modelIdToDataSetIds))
|
|
||||||
.flatMap(Collection::stream).collect(Collectors.toList());
|
|
||||||
hanlpMapResult.setNatures(natures);
|
|
||||||
}
|
|
||||||
return hanlpMapResults;
|
return hanlpMapResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,11 +74,8 @@ public class SearchService {
|
|||||||
|
|
||||||
public static List<HanlpMapResult> suffixSearch(String key, int limit, BinTrie<List<String>> binTrie,
|
public static List<HanlpMapResult> suffixSearch(String key, int limit, BinTrie<List<String>> binTrie,
|
||||||
Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
||||||
|
Set<Map.Entry<String, List<String>>> result = search(key, binTrie);
|
||||||
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, modelIdToDataSetIds,
|
List<HanlpMapResult> hanlpMapResults = result.stream().map(
|
||||||
detectDataSetIds);
|
|
||||||
|
|
||||||
return result.stream().map(
|
|
||||||
entry -> {
|
entry -> {
|
||||||
String name = entry.getKey().replace("#", " ");
|
String name = entry.getKey().replace("#", " ");
|
||||||
List<String> natures = entry.getValue().stream()
|
List<String> natures = entry.getValue().stream()
|
||||||
@@ -94,15 +85,34 @@ public class SearchService {
|
|||||||
return new HanlpMapResult(name, natures, key);
|
return new HanlpMapResult(name, natures, key);
|
||||||
}
|
}
|
||||||
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
|
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
|
||||||
.limit(SEARCH_SIZE)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
return transformAndFilterByDataSet(hanlpMapResults, modelIdToDataSetIds, detectDataSetIds, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key, int limit,
|
private static List<HanlpMapResult> transformAndFilterByDataSet(List<HanlpMapResult> hanlpMapResults,
|
||||||
BinTrie<List<String>> binTrie, Map<Long, List<Long>> modelIdToDataSetIds, Set<Long> detectDataSetIds) {
|
Map<Long, List<Long>> modelIdToDataSetIds,
|
||||||
|
Set<Long> detectDataSetIds, int limit) {
|
||||||
Set<Long> detectModelIds = NatureHelper.getModelIds(modelIdToDataSetIds, detectDataSetIds);
|
return hanlpMapResults.stream().peek(hanlpMapResult -> {
|
||||||
|
List<String> natures = hanlpMapResult.getNatures().stream()
|
||||||
|
.map(nature -> NatureHelper.changeModel2DataSet(nature, modelIdToDataSetIds))
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.filter(nature -> {
|
||||||
|
if (CollectionUtils.isEmpty(detectDataSetIds)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Long dataSetId = NatureHelper.getDataSetId(nature);
|
||||||
|
if (dataSetId != null) {
|
||||||
|
return detectDataSetIds.contains(dataSetId);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
hanlpMapResult.setNatures(natures);
|
||||||
|
}).filter(hanlpMapResult -> !CollectionUtils.isEmpty(hanlpMapResult.getNatures()))
|
||||||
|
.limit(limit).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Set<Map.Entry<String, List<String>>> search(String key,
|
||||||
|
BinTrie<List<String>> binTrie) {
|
||||||
key = key.toLowerCase();
|
key = key.toLowerCase();
|
||||||
Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<Map.Entry<String, List<String>>>();
|
Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<Map.Entry<String, List<String>>>();
|
||||||
|
|
||||||
@@ -122,7 +132,7 @@ public class SearchService {
|
|||||||
if (branch == null) {
|
if (branch == null) {
|
||||||
return entrySet;
|
return entrySet;
|
||||||
}
|
}
|
||||||
branch.walkLimit(sb, entrySet, limit, detectModelIds);
|
branch.walkLimit(sb, entrySet);
|
||||||
return entrySet;
|
return entrySet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ import com.tencent.supersonic.chat.server.plugin.PluginParseConfig;
|
|||||||
import com.tencent.supersonic.chat.server.plugin.build.WebBase;
|
import com.tencent.supersonic.chat.server.plugin.build.WebBase;
|
||||||
import com.tencent.supersonic.common.pojo.JoinCondition;
|
import com.tencent.supersonic.common.pojo.JoinCondition;
|
||||||
import com.tencent.supersonic.common.pojo.ModelRela;
|
import com.tencent.supersonic.common.pojo.ModelRela;
|
||||||
import com.tencent.supersonic.common.pojo.SystemConfig;
|
|
||||||
import com.tencent.supersonic.common.pojo.enums.AggOperatorEnum;
|
import com.tencent.supersonic.common.pojo.enums.AggOperatorEnum;
|
||||||
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
|
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
|
||||||
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
|
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
|
||||||
|
|||||||
Reference in New Issue
Block a user