(improvement)(chat) fixed HanLP multiple uppercase(#963) (#1254)

This commit is contained in:
jipeli
2024-06-27 18:56:34 +08:00
committed by GitHub
parent e07e74064d
commit 4e4943ffd1
5 changed files with 166 additions and 18 deletions

View File

@@ -8,6 +8,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -18,21 +19,31 @@ public class DictionaryAttributeUtil {
/**
 * Merges two dictionary attributes into a newly constructed attribute.
 *
 * <p>Frequencies and per-nature originals are collected from {@code old} first and from
 * {@code add} second, so for a nature present in both inputs the values of {@code add} win.
 * An input whose {@code originals} array is {@code null} contributes no originals, which
 * leaves any original already recorded for the same nature in place. The merged natures are
 * ordered by descending frequency; natures without a recorded original get a {@code null}
 * slot in the resulting originals array.
 *
 * @param old the first attribute to merge; must not be {@code null}
 * @param add the second attribute to merge, taking precedence on conflicts; must not be
 *            {@code null}
 * @return a new attribute combining both inputs
 */
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
    Map<Nature, Integer> frequencyByNature = new HashMap<>();
    Map<Nature, String> originalByNature = new HashMap<>();
    // Order matters: entries of "add" overwrite entries of "old" for the same nature.
    mergeInto(old, frequencyByNature, originalByNature);
    mergeInto(add, frequencyByNature, originalByNature);

    List<Map.Entry<Nature, Integer>> ranked = new LinkedList<>(frequencyByNature.entrySet());
    // Integer.compare avoids the overflow that subtracting two ints can produce.
    // List.sort is stable, so natures with equal frequency keep their map iteration order.
    ranked.sort((left, right) -> Integer.compare(right.getValue(), left.getValue()));

    Nature[] natures = ranked.stream().map(Map.Entry::getKey).toArray(Nature[]::new);
    int[] frequencies = ranked.stream().mapToInt(Map.Entry::getValue).toArray();
    String[] originals = ranked.stream()
            .map(entry -> originalByNature.get(entry.getKey()))
            .toArray(String[]::new);
    // The highest single frequency is handed to the constructor as its last argument
    // (NOTE(review): presumably the total-frequency slot - confirm against CoreDictionary).
    // An empty merge yields 0 instead of failing on a missing first element.
    int topFrequency = ranked.isEmpty() ? 0 : ranked.get(0).getValue();

    CoreDictionary.Attribute attribute =
            new CoreDictionary.Attribute(natures, frequencies, originals, topFrequency);
    // Carry over the single "original" value, preferring the one from "add".
    if (old.original != null || add.original != null) {
        attribute.original = add.original != null ? add.original : old.original;
    }
    return attribute;
}

/** Copies every nature's frequency, and its original when available, from {@code source} into the maps. */
private static void mergeInto(CoreDictionary.Attribute source, Map<Nature, Integer> frequencyByNature,
        Map<Nature, String> originalByNature) {
    for (int i = 0; i < source.nature.length; i++) {
        frequencyByNature.put(source.nature[i], source.frequency[i]);
        if (source.originals != null) {
            originalByNature.put(source.nature[i], source.originals[i]);
        }
    }
}
}

View File

@@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
customNatureCollector);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.originals[i] = original;
attribute.totalFrequency += attribute.frequency[i];
}
}
attribute.original = original;
//attribute.original = original;
if (removeDuplicates && map.containsKey(word)) {
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
@@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (att == null) {
return false;
} else if (this.dat.containsKey(word)) {
att.original = original;
att.setOriginals(original);
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
this.dat.set(word, att);
// return true;
@@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (this.trie == null) {
this.trie = new BinTrie();
}
att.original = original;
//att.original = original;
att.setOriginals(original);
if (this.trie.containsKey(word)) {
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
}

View File

@@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
import com.tencent.supersonic.headless.chat.knowledge.SearchService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -29,6 +26,11 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
/**
* HanLP helper
@@ -200,16 +202,69 @@ public class HanlpHelper {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}
List<T> newResults = new ArrayList<>();
for (T mapResult : mapResults) {
boolean isAdd = false;
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
if (CustomDictionary.contains(mapResult.getName())) {
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
if (attribute != null && attribute.original != null) {
mapResult.setName(attribute.original);
if (attribute != null) {
isAdd = addLetterOriginal(newResults, mapResult, attribute);
}
}
}
if (!isAdd) {
newResults.add(mapResult);
}
}
mapResults.clear();
mapResults.addAll(newResults);
}
/**
 * Appends to {@code mapResults} copies of {@code mapResult} whose name is replaced by the
 * original strings held in {@code attribute}.
 *
 * <ul>
 *   <li>{@link HanlpMapResult}: for each of its natures, the value returned by
 *       {@code attribute.getOriginal(...)} for that nature is looked up; every non-null value
 *       produces a new {@code HanlpMapResult} carrying that single nature and the same
 *       detect word.</li>
 *   <li>{@link DatabaseMapResult}: every non-null entry of {@code attribute.getOriginals()}
 *       produces a copy with the same schema element and detect word.</li>
 *   <li>{@link EmbeddingResult}: every non-null entry of {@code attribute.getOriginals()}
 *       produces a copy with the same detect word, id, metadata and distance.</li>
 * </ul>
 *
 * <p>Null originals are skipped in every branch, so no result with a {@code null} name is
 * ever appended. Any other result type is left alone.
 *
 * @param mapResults list that receives the newly created results
 * @param mapResult  the result whose name should be replaced
 * @param attribute  dictionary attribute supplying the originals; may be {@code null}
 * @param <T>        concrete result type held by {@code mapResults}
 * @return {@code true} if at least one result was appended, {@code false} otherwise
 */
public static <T extends MapResult> boolean addLetterOriginal(List<T> mapResults, T mapResult,
        CoreDictionary.Attribute attribute) {
    if (attribute == null) {
        return false;
    }
    boolean isAdd = false;
    if (mapResult instanceof HanlpMapResult) {
        HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult;
        for (String nature : hanlpMapResult.getNatures()) {
            String orig = attribute.getOriginal(Nature.fromString(nature));
            if (orig == null) {
                continue;
            }
            MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature),
                    hanlpMapResult.getDetectWord());
            appendRestored(mapResults, addMapResult);
            isAdd = true;
        }
    } else if (mapResult instanceof DatabaseMapResult) {
        DatabaseMapResult databaseMapResult = (DatabaseMapResult) mapResult;
        List<String> originals = attribute.getOriginals();
        if (!CollectionUtils.isEmpty(originals)) {
            for (String orig : originals) {
                if (orig == null) {
                    // Same rule as the HanlpMapResult branch: never emit a nameless result.
                    continue;
                }
                DatabaseMapResult addMapResult = new DatabaseMapResult();
                addMapResult.setName(orig);
                addMapResult.setSchemaElement(databaseMapResult.getSchemaElement());
                addMapResult.setDetectWord(databaseMapResult.getDetectWord());
                appendRestored(mapResults, addMapResult);
                isAdd = true;
            }
        }
    } else if (mapResult instanceof EmbeddingResult) {
        EmbeddingResult embeddingResult = (EmbeddingResult) mapResult;
        List<String> originals = attribute.getOriginals();
        if (!CollectionUtils.isEmpty(originals)) {
            for (String orig : originals) {
                if (orig == null) {
                    // Same rule as the HanlpMapResult branch: never emit a nameless result.
                    continue;
                }
                EmbeddingResult addMapResult = new EmbeddingResult();
                addMapResult.setName(orig);
                addMapResult.setDetectWord(embeddingResult.getDetectWord());
                addMapResult.setId(embeddingResult.getId());
                addMapResult.setMetadata(embeddingResult.getMetadata());
                addMapResult.setDistance(embeddingResult.getDistance());
                appendRestored(mapResults, addMapResult);
                isAdd = true;
            }
        }
    }
    return isAdd;
}

/** Adds {@code restored} to {@code target}, confining the unchecked cast to one place. */
@SuppressWarnings("unchecked")
private static <T extends MapResult> void appendRestored(List<T> target, MapResult restored) {
    // The copy is created with the same class the instanceof check matched; this assumes T is
    // that class rather than a further subclass of it - TODO confirm against callers.
    target.add((T) restored);
}
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {