(improvement)(chat) fixed HanLP handling of multiple uppercase originals (#963) (#1254)

Author: jipeli
Date: 2024-06-27 18:56:34 +08:00
Committed by: GitHub
Parent: e07e74064d
Commit: 4e4943ffd1
5 changed files with 166 additions and 18 deletions
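
In short, a lowercase dictionary key can now carry several original (cased) spellings, one per nature, via a new originals array on CoreDictionary.Attribute, instead of the single original field. A minimal sketch of the accessors introduced in this commit; the class name, nature names, and values below are illustrative, and it assumes the patched CoreDictionary from this change is on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary.Attribute;

public class OriginalsSketch {
    public static void main(String[] args) {
        Nature n3 = Nature.create("_3");
        Nature n4 = Nature.create("_4");
        // Two cased spellings of the same lowercase word, each tied to its own nature.
        Attribute att = new Attribute(new Nature[] {n3, n4}, new int[] {100, 200},
                new String[] {"AA", "Aa"}, 300);
        System.out.println(att.getOriginal(n3));  // AA
        System.out.println(att.getOriginal(n4));  // Aa
        System.out.println(att.getOriginals());   // [AA, Aa]
    }
}

HanlpHelper.transLetterOriginal (further down in this diff) uses these accessors to expand a lowercase match into one result per stored spelling.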

File: CoreDictionary.java

@@ -8,7 +8,6 @@ import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
@@ -16,8 +15,11 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.TreeMap;
import java.util.stream.Collectors;
/**
* Core dictionary implemented with a DoubleArrayTrie
@@ -73,7 +75,8 @@ public class CoreDictionary {
totalFrequency += attribute.totalFrequency;
}
Predefine.logger.info(
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (
System.currentTimeMillis() - start)
+ "ms");
br.close();
trie.build(map);
@@ -214,12 +217,14 @@ public class CoreDictionary {
public int[] frequency;
public int totalFrequency;
public String[] originals;
public String original = null;
public Attribute(int size) {
nature = new Nature[size];
frequency = new int[size];
originals = new String[size];
}
public Attribute(Nature[] nature, int[] frequency) {
@@ -240,6 +245,13 @@ public class CoreDictionary {
this.totalFrequency = totalFrequency;
}
public Attribute(Nature[] nature, int[] frequency, String[] originals, int totalFrequency) {
this.nature = nature;
this.frequency = frequency;
this.originals = originals;
this.totalFrequency = totalFrequency;
}
/**
* Construct with a single nature and a default word frequency of 1000
*
@@ -365,6 +377,35 @@ public class CoreDictionary {
out.writeInt(frequency[i]);
}
}
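/** Stores the given original (cased) spelling in the first slot, allocating the array if it is missing or empty. */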
public void setOriginals(String original) {
if (original == null) {
return;
}
if (originals == null || originals.length == 0) {
originals = new String[1];
}
originals[0] = original;
}
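/** Returns the original spelling recorded for the given nature, or null if none is stored. */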
public String getOriginal(Nature find) {
if (originals == null || originals.length == 0 || find == null) {
return null;
}
for (int i = 0; i < nature.length; i++) {
if (find.equals(nature[i]) && originals.length > i) {
return originals[i];
}
}
return null;
}
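/** Returns the distinct non-null original spellings, or null if none are stored. */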
public List<String> getOriginals() {
if (originals == null || originals.length == 0) {
return null;
}
return Arrays.stream(originals).filter(o -> o != null).distinct().collect(Collectors.toList());
}
}
/**

File: DictionaryAttributeUtil.java

@@ -8,6 +8,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -18,21 +19,31 @@ public class DictionaryAttributeUtil {
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
Map<Nature, Integer> map = new HashMap<>();
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
Map<Nature, String> originalMap = new HashMap<>();
IntStream.range(0, old.nature.length).boxed().forEach(i -> {
map.put(old.nature[i], old.frequency[i]);
if (Objects.nonNull(old.originals)) {
originalMap.put(old.nature[i], old.originals[i]);
}
});
IntStream.range(0, add.nature.length).boxed().forEach(i -> {
map.put(add.nature[i], add.frequency[i]);
if (Objects.nonNull(add.originals)) {
originalMap.put(add.nature[i], add.originals[i]);
}
});
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
return o2.getValue() - o1.getValue();
}
});
String[] originals = list.stream().map(l -> originalMap.get(l.getKey())).toArray(String[]::new);
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
originals,
list.stream().map(i -> i.getValue()).findFirst().get());
if (old.original != null || add.original != null) {
attribute.original = add.original != null ? add.original : old.original;
}
return attribute;
}
}
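
A hedged sketch of the merge above when the same lowercase word is registered twice with different casings. The DictionaryAttributeUtil import path is assumed (the diff does not show its package), and natures are reordered by descending frequency during the merge:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
// Assumption: DictionaryAttributeUtil sits alongside MultiCustomDictionary; the diff does not show its package.
import com.tencent.supersonic.headless.chat.knowledge.DictionaryAttributeUtil;

public class MergeOriginalsSketch {
    public static void main(String[] args) {
        Nature n3 = Nature.create("_3");
        Nature n4 = Nature.create("_4");
        CoreDictionary.Attribute oldAtt = new CoreDictionary.Attribute(
                new Nature[] {n3}, new int[] {100}, new String[] {"AA"}, 100);
        CoreDictionary.Attribute addAtt = new CoreDictionary.Attribute(
                new Nature[] {n4}, new int[] {200}, new String[] {"Aa"}, 200);
        CoreDictionary.Attribute merged = DictionaryAttributeUtil.getAttribute(oldAtt, addAtt);
        // Natures come back ordered by descending frequency; each keeps its own original spelling.
        System.out.println(merged.getOriginal(n3));  // AA
        System.out.println(merged.getOriginal(n4));  // Aa
    }
}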

File: MultiCustomDictionary.java

@@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
customNatureCollector);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.originals[i] = original;
attribute.totalFrequency += attribute.frequency[i];
}
}
attribute.original = original;
//attribute.original = original;
if (removeDuplicates && map.containsKey(word)) {
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
@@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (att == null) {
return false;
} else if (this.dat.containsKey(word)) {
att.original = original;
att.setOriginals(original);
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
this.dat.set(word, att);
// return true;
@@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (this.trie == null) {
this.trie = new BinTrie();
}
att.original = original;
//att.original = original;
att.setOriginals(original);
if (this.trie.containsKey(word)) {
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
}

File: HanlpHelper.java

@@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
import com.tencent.supersonic.headless.chat.knowledge.SearchService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -29,6 +26,11 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
/**
* HanLP helper
@@ -200,16 +202,69 @@ public class HanlpHelper {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}
List<T> newResults = new ArrayList<>();
for (T mapResult : mapResults) {
boolean isAdd = false;
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
if (CustomDictionary.contains(mapResult.getName())) {
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
if (attribute != null && attribute.original != null) {
mapResult.setName(attribute.original);
if (attribute != null) {
isAdd = addLetterOriginal(newResults, mapResult, attribute);
}
}
}
if (!isAdd) {
newResults.add(mapResult);
}
}
mapResults.clear();
mapResults.addAll(newResults);
}
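/** Adds one new result per original spelling stored for the matched word (per nature for HanlpMapResult); returns true if anything was added. */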
public static <T extends MapResult> boolean addLetterOriginal(List<T> mapResults, T mapResult,
CoreDictionary.Attribute attribute) {
boolean isAdd = false;
if (attribute != null) {
if (mapResult instanceof HanlpMapResult) {
HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult;
for (String nature : hanlpMapResult.getNatures()) {
String orig = attribute.getOriginal(Nature.fromString(nature));
if (orig != null) {
MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature),
hanlpMapResult.getDetectWord());
mapResults.add((T) addMapResult);
isAdd = true;
}
}
} else if (mapResult instanceof DatabaseMapResult) {
List<String> originals = attribute.getOriginals();
if (!CollectionUtils.isEmpty(originals)) {
for (String orig : originals) {
DatabaseMapResult addMapResult = new DatabaseMapResult();
addMapResult.setName(orig);
addMapResult.setSchemaElement(((DatabaseMapResult) mapResult).getSchemaElement());
addMapResult.setDetectWord(mapResult.getDetectWord());
mapResults.add((T) addMapResult);
isAdd = true;
}
}
} else if (mapResult instanceof EmbeddingResult) {
List<String> originals = attribute.getOriginals();
if (!CollectionUtils.isEmpty(originals)) {
for (String orig : originals) {
EmbeddingResult addMapResult = new EmbeddingResult();
addMapResult.setName(orig);
addMapResult.setDetectWord(mapResult.getDetectWord());
addMapResult.setId(((EmbeddingResult) mapResult).getId());
addMapResult.setMetadata(((EmbeddingResult) mapResult).getMetadata());
addMapResult.setDistance(((EmbeddingResult) mapResult).getDistance());
mapResults.add((T) addMapResult);
isAdd = true;
}
}
}
}
return isAdd;
}
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {

File: HanlpTest.java

@@ -0,0 +1,39 @@
package com.tencent.supersonic.util;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary.Attribute;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
public class HanlpTest {
@Test
public void test() {
Nature[] nature = new Nature[2];
nature[0] = Nature.create("_3");
nature[1] = Nature.create("_4");
int[] frequency = new int[2];
frequency[0] = 100;
frequency[1] = 200;
String[] originals = new String[2];
originals[0] = "AA";
originals[1] = "Aa";
Attribute att = new Attribute(nature, frequency, originals, 200);
att.original = "DDDDD";
HanlpHelper.getDynamicCustomDictionary().getTrie().set("aa", att);
List<MapResult> mapResults = new ArrayList<>();
DatabaseMapResult addMapResult = new DatabaseMapResult();
addMapResult.setName("aa");
addMapResult.setSchemaElement(new SchemaElement());
addMapResult.setDetectWord("abc");
mapResults.add(addMapResult);
HanlpHelper.transLetterOriginal(mapResults);
Assert.assertEquals(mapResults.size(), 2);
}
}