(improvement)(chat) fix HanLP handling of words with multiple uppercase originals (#963) (#1254)

This commit is contained in:
jipeli
2024-06-27 18:56:34 +08:00
committed by GitHub
parent e07e74064d
commit 4e4943ffd1
5 changed files with 166 additions and 18 deletions

View File

@@ -8,7 +8,6 @@ import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature; import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine; import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility; import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream; import java.io.BufferedOutputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.DataOutputStream; import java.io.DataOutputStream;
@@ -16,8 +15,11 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Serializable; import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.List;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.stream.Collectors;
/** /**
* 使用DoubleArrayTrie实现的核心词典 * 使用DoubleArrayTrie实现的核心词典
@@ -73,7 +75,8 @@ public class CoreDictionary {
totalFrequency += attribute.totalFrequency; totalFrequency += attribute.totalFrequency;
} }
Predefine.logger.info( Predefine.logger.info(
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start) "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (
System.currentTimeMillis() - start)
+ "ms"); + "ms");
br.close(); br.close();
trie.build(map); trie.build(map);
@@ -214,12 +217,14 @@ public class CoreDictionary {
public int[] frequency; public int[] frequency;
public int totalFrequency; public int totalFrequency;
public String[] originals;
public String original = null; public String original = null;
public Attribute(int size) { public Attribute(int size) {
nature = new Nature[size]; nature = new Nature[size];
frequency = new int[size]; frequency = new int[size];
originals = new String[size];
} }
public Attribute(Nature[] nature, int[] frequency) { public Attribute(Nature[] nature, int[] frequency) {
@@ -240,6 +245,13 @@ public class CoreDictionary {
this.totalFrequency = totalFrequency; this.totalFrequency = totalFrequency;
} }
public Attribute(Nature[] nature, int[] frequency, String[] originals, int totalFrequency) {
this.nature = nature;
this.frequency = frequency;
this.originals = originals;
this.totalFrequency = totalFrequency;
}
/** /**
* 使用单个词性默认词频1000构造 * 使用单个词性默认词频1000构造
* *
@@ -365,6 +377,35 @@ public class CoreDictionary {
out.writeInt(frequency[i]); out.writeInt(frequency[i]);
} }
} }
public void setOriginals(String original) {
if (original == null) {
return;
}
if (originals == null || originals.length == 0) {
originals = new String[1];
}
originals[0] = original;
}
public String getOriginal(Nature find) {
if (originals == null || originals.length == 0 || find == null) {
return null;
}
for (int i = 0; i < nature.length; i++) {
if (find.equals(nature[i]) && originals.length > i) {
return originals[i];
}
}
return null;
}
public List<String> getOriginals() {
if (originals == null || originals.length == 0) {
return null;
}
return Arrays.stream(originals).filter(o -> o != null).distinct().collect(Collectors.toList());
}
} }
/** /**

View File

@@ -8,6 +8,7 @@ import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
@@ -18,21 +19,31 @@ public class DictionaryAttributeUtil {
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) { public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
Map<Nature, Integer> map = new HashMap<>(); Map<Nature, Integer> map = new HashMap<>();
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i])); Map<Nature, String> originalMap = new HashMap<>();
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i])); IntStream.range(0, old.nature.length).boxed().forEach(i -> {
map.put(old.nature[i], old.frequency[i]);
if (Objects.nonNull(old.originals)) {
originalMap.put(old.nature[i], old.originals[i]);
}
});
IntStream.range(0, add.nature.length).boxed().forEach(i -> {
map.put(add.nature[i], add.frequency[i]);
if (Objects.nonNull(add.originals)) {
originalMap.put(add.nature[i], add.originals[i]);
}
});
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet()); List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() { Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) { public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
return o2.getValue() - o1.getValue(); return o2.getValue() - o1.getValue();
} }
}); });
String[] originals = list.stream().map(l -> originalMap.get(l.getKey())).toArray(String[]::new);
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute( CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]), list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(), list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
originals,
list.stream().map(i -> i.getValue()).findFirst().get()); list.stream().map(i -> i.getValue()).findFirst().get());
if (old.original != null || add.original != null) {
attribute.original = add.original != null ? add.original : old.original;
}
return attribute; return attribute;
} }
} }

View File

@@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i], attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
customNatureCollector); customNatureCollector);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]); attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.originals[i] = original;
attribute.totalFrequency += attribute.frequency[i]; attribute.totalFrequency += attribute.frequency[i];
} }
} }
attribute.original = original; //attribute.original = original;
if (removeDuplicates && map.containsKey(word)) { if (removeDuplicates && map.containsKey(word)) {
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute); attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
@@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (att == null) { if (att == null) {
return false; return false;
} else if (this.dat.containsKey(word)) { } else if (this.dat.containsKey(word)) {
att.original = original; att.setOriginals(original);
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att); att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
this.dat.set(word, att); this.dat.set(word, att);
// return true; // return true;
@@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
if (this.trie == null) { if (this.trie == null) {
this.trie = new BinTrie(); this.trie = new BinTrie();
} }
att.original = original; //att.original = original;
att.setOriginals(original);
if (this.trie.containsKey(word)) { if (this.trie.containsKey(word)) {
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att); att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
} }

View File

@@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term; import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.common.pojo.enums.DictWordType; import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term; import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.DictWord; import com.tencent.supersonic.headless.chat.knowledge.DictWord;
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter; import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.chat.knowledge.MapResult; import com.tencent.supersonic.headless.chat.knowledge.MapResult;
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary; import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
import com.tencent.supersonic.headless.chat.knowledge.SearchService; import com.tencent.supersonic.headless.chat.knowledge.SearchService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
@@ -29,6 +26,11 @@ import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
/** /**
* HanLP helper * HanLP helper
@@ -200,16 +202,69 @@ public class HanlpHelper {
if (CollectionUtils.isEmpty(mapResults)) { if (CollectionUtils.isEmpty(mapResults)) {
return; return;
} }
List<T> newResults = new ArrayList<>();
for (T mapResult : mapResults) { for (T mapResult : mapResults) {
boolean isAdd = false;
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) { if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
if (CustomDictionary.contains(mapResult.getName())) { if (CustomDictionary.contains(mapResult.getName())) {
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName()); CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
if (attribute != null && attribute.original != null) { if (attribute != null) {
mapResult.setName(attribute.original); isAdd = addLetterOriginal(newResults, mapResult, attribute);
}
}
}
if (!isAdd) {
newResults.add(mapResult);
}
}
mapResults.clear();
mapResults.addAll(newResults);
}
public static <T extends MapResult> boolean addLetterOriginal(List<T> mapResults, T mapResult,
CoreDictionary.Attribute attribute) {
boolean isAdd = false;
if (attribute != null) {
if (mapResult instanceof HanlpMapResult) {
HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult;
for (String nature : hanlpMapResult.getNatures()) {
String orig = attribute.getOriginal(Nature.fromString(nature));
if (orig != null) {
MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature),
hanlpMapResult.getDetectWord());
mapResults.add((T) addMapResult);
isAdd = true;
}
}
} else if (mapResult instanceof DatabaseMapResult) {
List<String> originals = attribute.getOriginals();
if (!CollectionUtils.isEmpty(originals)) {
for (String orig : originals) {
DatabaseMapResult addMapResult = new DatabaseMapResult();
addMapResult.setName(orig);
addMapResult.setSchemaElement(((DatabaseMapResult) mapResult).getSchemaElement());
addMapResult.setDetectWord(mapResult.getDetectWord());
mapResults.add((T) addMapResult);
isAdd = true;
}
}
} else if (mapResult instanceof EmbeddingResult) {
List<String> originals = attribute.getOriginals();
if (!CollectionUtils.isEmpty(originals)) {
for (String orig : originals) {
EmbeddingResult addMapResult = new EmbeddingResult();
addMapResult.setName(orig);
addMapResult.setDetectWord(mapResult.getDetectWord());
addMapResult.setId(((EmbeddingResult) mapResult).getId());
addMapResult.setMetadata(((EmbeddingResult) mapResult).getMetadata());
addMapResult.setDistance(((EmbeddingResult) mapResult).getDistance());
mapResults.add((T) addMapResult);
isAdd = true;
} }
} }
} }
} }
return isAdd;
} }
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) { public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {

View File

@@ -0,0 +1,39 @@
package com.tencent.supersonic.util;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary.Attribute;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
public class HanlpTest {

    /**
     * Verifies that {@code HanlpHelper.transLetterOriginal} expands a lowercase
     * dictionary hit into one result per distinct original casing.
     *
     * <p>The word "aa" is registered with two natures ({@code _3}, {@code _4})
     * whose originals are "AA" and "Aa". After translation, the single
     * lowercase {@code DatabaseMapResult} must be replaced by two results,
     * one per distinct original.
     */
    @Test
    public void test() {
        // Two natures for the same lowercase word, each with its own original casing.
        Nature[] nature = new Nature[2];
        nature[0] = Nature.create("_3");
        nature[1] = Nature.create("_4");
        int[] frequency = new int[2];
        frequency[0] = 100;
        frequency[1] = 200;
        String[] originals = new String[2];
        originals[0] = "AA";
        originals[1] = "Aa";
        Attribute att = new Attribute(nature, frequency, originals, 200);
        // Legacy single-original field; the new per-nature originals should win.
        att.original = "DDDDD";
        HanlpHelper.getDynamicCustomDictionary().getTrie().set("aa", att);

        List<MapResult> mapResults = new ArrayList<>();
        DatabaseMapResult addMapResult = new DatabaseMapResult();
        addMapResult.setName("aa");
        addMapResult.setSchemaElement(new SchemaElement());
        addMapResult.setDetectWord("abc");
        mapResults.add(addMapResult);

        HanlpHelper.transLetterOriginal(mapResults);

        // JUnit convention: expected value first, actual second — reversed
        // arguments produce misleading "expected X but was Y" messages.
        // "AA" and "Aa" are distinct, so the lowercase entry expands to 2 results.
        Assert.assertEquals(2, mapResults.size());
    }
}