mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-15 06:27:21 +00:00
@@ -8,6 +8,7 @@ import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
@@ -18,21 +19,31 @@ public class DictionaryAttributeUtil {
|
||||
|
||||
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
|
||||
Map<Nature, Integer> map = new HashMap<>();
|
||||
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
|
||||
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
|
||||
Map<Nature, String> originalMap = new HashMap<>();
|
||||
IntStream.range(0, old.nature.length).boxed().forEach(i -> {
|
||||
map.put(old.nature[i], old.frequency[i]);
|
||||
if (Objects.nonNull(old.originals)) {
|
||||
originalMap.put(old.nature[i], old.originals[i]);
|
||||
}
|
||||
});
|
||||
IntStream.range(0, add.nature.length).boxed().forEach(i -> {
|
||||
map.put(add.nature[i], add.frequency[i]);
|
||||
if (Objects.nonNull(add.originals)) {
|
||||
originalMap.put(add.nature[i], add.originals[i]);
|
||||
}
|
||||
});
|
||||
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
|
||||
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
|
||||
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
|
||||
return o2.getValue() - o1.getValue();
|
||||
}
|
||||
});
|
||||
String[] originals = list.stream().map(l -> originalMap.get(l.getKey())).toArray(String[]::new);
|
||||
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
|
||||
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
|
||||
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
|
||||
originals,
|
||||
list.stream().map(i -> i.getValue()).findFirst().get());
|
||||
if (old.original != null || add.original != null) {
|
||||
attribute.original = add.original != null ? add.original : old.original;
|
||||
}
|
||||
return attribute;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
||||
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
|
||||
customNatureCollector);
|
||||
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
|
||||
attribute.originals[i] = original;
|
||||
attribute.totalFrequency += attribute.frequency[i];
|
||||
}
|
||||
}
|
||||
attribute.original = original;
|
||||
//attribute.original = original;
|
||||
|
||||
if (removeDuplicates && map.containsKey(word)) {
|
||||
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
|
||||
@@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
||||
if (att == null) {
|
||||
return false;
|
||||
} else if (this.dat.containsKey(word)) {
|
||||
att.original = original;
|
||||
att.setOriginals(original);
|
||||
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
|
||||
this.dat.set(word, att);
|
||||
// return true;
|
||||
@@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
||||
if (this.trie == null) {
|
||||
this.trie = new BinTrie();
|
||||
}
|
||||
att.original = original;
|
||||
//att.original = original;
|
||||
att.setOriginals(original);
|
||||
if (this.trie.containsKey(word)) {
|
||||
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
|
||||
}
|
||||
|
||||
@@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment;
|
||||
import com.hankcs.hanlp.seg.common.Term;
|
||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.SearchService;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
import org.springframework.util.ResourceUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
@@ -29,6 +26,11 @@ import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
import org.springframework.util.ResourceUtils;
|
||||
|
||||
/**
|
||||
* HanLP helper
|
||||
@@ -200,16 +202,69 @@ public class HanlpHelper {
|
||||
if (CollectionUtils.isEmpty(mapResults)) {
|
||||
return;
|
||||
}
|
||||
List<T> newResults = new ArrayList<>();
|
||||
for (T mapResult : mapResults) {
|
||||
boolean isAdd = false;
|
||||
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
|
||||
if (CustomDictionary.contains(mapResult.getName())) {
|
||||
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
|
||||
if (attribute != null && attribute.original != null) {
|
||||
mapResult.setName(attribute.original);
|
||||
if (attribute != null) {
|
||||
isAdd = addLetterOriginal(newResults, mapResult, attribute);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!isAdd) {
|
||||
newResults.add(mapResult);
|
||||
}
|
||||
}
|
||||
mapResults.clear();
|
||||
mapResults.addAll(newResults);
|
||||
}
|
||||
|
||||
public static <T extends MapResult> boolean addLetterOriginal(List<T> mapResults, T mapResult,
|
||||
CoreDictionary.Attribute attribute) {
|
||||
boolean isAdd = false;
|
||||
if (attribute != null) {
|
||||
if (mapResult instanceof HanlpMapResult) {
|
||||
HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult;
|
||||
for (String nature : hanlpMapResult.getNatures()) {
|
||||
String orig = attribute.getOriginal(Nature.fromString(nature));
|
||||
if (orig != null) {
|
||||
MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature),
|
||||
hanlpMapResult.getDetectWord());
|
||||
mapResults.add((T) addMapResult);
|
||||
isAdd = true;
|
||||
}
|
||||
}
|
||||
} else if (mapResult instanceof DatabaseMapResult) {
|
||||
List<String> originals = attribute.getOriginals();
|
||||
if (!CollectionUtils.isEmpty(originals)) {
|
||||
for (String orig : originals) {
|
||||
DatabaseMapResult addMapResult = new DatabaseMapResult();
|
||||
addMapResult.setName(orig);
|
||||
addMapResult.setSchemaElement(((DatabaseMapResult) mapResult).getSchemaElement());
|
||||
addMapResult.setDetectWord(mapResult.getDetectWord());
|
||||
mapResults.add((T) addMapResult);
|
||||
isAdd = true;
|
||||
}
|
||||
}
|
||||
} else if (mapResult instanceof EmbeddingResult) {
|
||||
List<String> originals = attribute.getOriginals();
|
||||
if (!CollectionUtils.isEmpty(originals)) {
|
||||
for (String orig : originals) {
|
||||
EmbeddingResult addMapResult = new EmbeddingResult();
|
||||
addMapResult.setName(orig);
|
||||
addMapResult.setDetectWord(mapResult.getDetectWord());
|
||||
addMapResult.setId(((EmbeddingResult) mapResult).getId());
|
||||
addMapResult.setMetadata(((EmbeddingResult) mapResult).getMetadata());
|
||||
addMapResult.setDistance(((EmbeddingResult) mapResult).getDistance());
|
||||
mapResults.add((T) addMapResult);
|
||||
isAdd = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return isAdd;
|
||||
}
|
||||
|
||||
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||
|
||||
Reference in New Issue
Block a user