mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-13 04:57:28 +00:00
@@ -8,7 +8,6 @@ import com.hankcs.hanlp.corpus.io.IOUtil;
|
|||||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||||
import com.hankcs.hanlp.utility.Predefine;
|
import com.hankcs.hanlp.utility.Predefine;
|
||||||
import com.hankcs.hanlp.utility.TextUtility;
|
import com.hankcs.hanlp.utility.TextUtility;
|
||||||
|
|
||||||
import java.io.BufferedOutputStream;
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.DataOutputStream;
|
import java.io.DataOutputStream;
|
||||||
@@ -16,8 +15,11 @@ import java.io.FileNotFoundException;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 使用DoubleArrayTrie实现的核心词典
|
* 使用DoubleArrayTrie实现的核心词典
|
||||||
@@ -73,7 +75,8 @@ public class CoreDictionary {
|
|||||||
totalFrequency += attribute.totalFrequency;
|
totalFrequency += attribute.totalFrequency;
|
||||||
}
|
}
|
||||||
Predefine.logger.info(
|
Predefine.logger.info(
|
||||||
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
|
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (
|
||||||
|
System.currentTimeMillis() - start)
|
||||||
+ "ms");
|
+ "ms");
|
||||||
br.close();
|
br.close();
|
||||||
trie.build(map);
|
trie.build(map);
|
||||||
@@ -214,12 +217,14 @@ public class CoreDictionary {
|
|||||||
public int[] frequency;
|
public int[] frequency;
|
||||||
|
|
||||||
public int totalFrequency;
|
public int totalFrequency;
|
||||||
|
public String[] originals;
|
||||||
public String original = null;
|
public String original = null;
|
||||||
|
|
||||||
|
|
||||||
/**
 * Creates an attribute with {@code size} empty slots.
 * The nature, frequency and originals arrays are all allocated with the same length.
 *
 * @param size number of slots to allocate in each array
 */
public Attribute(int size) {
    this.originals = new String[size];
    this.frequency = new int[size];
    this.nature = new Nature[size];
}
|
||||||
|
|
||||||
public Attribute(Nature[] nature, int[] frequency) {
|
public Attribute(Nature[] nature, int[] frequency) {
|
||||||
@@ -240,6 +245,13 @@ public class CoreDictionary {
|
|||||||
this.totalFrequency = totalFrequency;
|
this.totalFrequency = totalFrequency;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an attribute from pre-built arrays, including the original spellings.
 * The arrays are stored by reference; no copy is made.
 *
 * @param nature         part-of-speech tags
 * @param frequency      frequencies for the tags
 * @param originals      original spellings; may be {@code null}
 * @param totalFrequency value stored in the {@code totalFrequency} field
 */
public Attribute(Nature[] nature, int[] frequency, String[] originals, int totalFrequency) {
    this.totalFrequency = totalFrequency;
    this.originals = originals;
    this.frequency = frequency;
    this.nature = nature;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 使用单个词性,默认词频1000构造
|
* 使用单个词性,默认词频1000构造
|
||||||
*
|
*
|
||||||
@@ -365,6 +377,35 @@ public class CoreDictionary {
|
|||||||
out.writeInt(frequency[i]);
|
out.writeInt(frequency[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setOriginals(String original) {
|
||||||
|
if (original == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (originals == null || originals.length == 0) {
|
||||||
|
originals = new String[1];
|
||||||
|
}
|
||||||
|
originals[0] = original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getOriginal(Nature find) {
|
||||||
|
if (originals == null || originals.length == 0 || find == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nature.length; i++) {
|
||||||
|
if (find.equals(nature[i]) && originals.length > i) {
|
||||||
|
return originals[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getOriginals() {
|
||||||
|
if (originals == null || originals.length == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return Arrays.stream(originals).filter(o -> o != null).distinct().collect(Collectors.toList());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import java.util.HashMap;
|
|||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
@@ -18,21 +19,31 @@ public class DictionaryAttributeUtil {
|
|||||||
|
|
||||||
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
|
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
|
||||||
Map<Nature, Integer> map = new HashMap<>();
|
Map<Nature, Integer> map = new HashMap<>();
|
||||||
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
|
Map<Nature, String> originalMap = new HashMap<>();
|
||||||
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
|
IntStream.range(0, old.nature.length).boxed().forEach(i -> {
|
||||||
|
map.put(old.nature[i], old.frequency[i]);
|
||||||
|
if (Objects.nonNull(old.originals)) {
|
||||||
|
originalMap.put(old.nature[i], old.originals[i]);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
IntStream.range(0, add.nature.length).boxed().forEach(i -> {
|
||||||
|
map.put(add.nature[i], add.frequency[i]);
|
||||||
|
if (Objects.nonNull(add.originals)) {
|
||||||
|
originalMap.put(add.nature[i], add.originals[i]);
|
||||||
|
}
|
||||||
|
});
|
||||||
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
|
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
|
||||||
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
|
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
|
||||||
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
|
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
|
||||||
return o2.getValue() - o1.getValue();
|
return o2.getValue() - o1.getValue();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
String[] originals = list.stream().map(l -> originalMap.get(l.getKey())).toArray(String[]::new);
|
||||||
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
|
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
|
||||||
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
|
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
|
||||||
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
|
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
|
||||||
|
originals,
|
||||||
list.stream().map(i -> i.getValue()).findFirst().get());
|
list.stream().map(i -> i.getValue()).findFirst().get());
|
||||||
if (old.original != null || add.original != null) {
|
|
||||||
attribute.original = add.original != null ? add.original : old.original;
|
|
||||||
}
|
|
||||||
return attribute;
|
return attribute;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -108,10 +108,11 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
|||||||
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
|
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
|
||||||
customNatureCollector);
|
customNatureCollector);
|
||||||
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
|
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
|
||||||
|
attribute.originals[i] = original;
|
||||||
attribute.totalFrequency += attribute.frequency[i];
|
attribute.totalFrequency += attribute.frequency[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
attribute.original = original;
|
//attribute.original = original;
|
||||||
|
|
||||||
if (removeDuplicates && map.containsKey(word)) {
|
if (removeDuplicates && map.containsKey(word)) {
|
||||||
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
|
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
|
||||||
@@ -373,7 +374,7 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
|||||||
if (att == null) {
|
if (att == null) {
|
||||||
return false;
|
return false;
|
||||||
} else if (this.dat.containsKey(word)) {
|
} else if (this.dat.containsKey(word)) {
|
||||||
att.original = original;
|
att.setOriginals(original);
|
||||||
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
|
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
|
||||||
this.dat.set(word, att);
|
this.dat.set(word, att);
|
||||||
// return true;
|
// return true;
|
||||||
@@ -381,7 +382,8 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
|||||||
if (this.trie == null) {
|
if (this.trie == null) {
|
||||||
this.trie = new BinTrie();
|
this.trie = new BinTrie();
|
||||||
}
|
}
|
||||||
att.original = original;
|
//att.original = original;
|
||||||
|
att.setOriginals(original);
|
||||||
if (this.trie.containsKey(word)) {
|
if (this.trie.containsKey(word)) {
|
||||||
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
|
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,17 +9,14 @@ import com.hankcs.hanlp.seg.Segment;
|
|||||||
import com.hankcs.hanlp.seg.common.Term;
|
import com.hankcs.hanlp.seg.common.Term;
|
||||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
|
||||||
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
|
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
|
||||||
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
|
import com.tencent.supersonic.headless.chat.knowledge.HadoopFileIOAdapter;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
|
||||||
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
|
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
|
||||||
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
|
import com.tencent.supersonic.headless.chat.knowledge.MultiCustomDictionary;
|
||||||
import com.tencent.supersonic.headless.chat.knowledge.SearchService;
|
import com.tencent.supersonic.headless.chat.knowledge.SearchService;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.springframework.beans.BeanUtils;
|
|
||||||
import org.springframework.util.CollectionUtils;
|
|
||||||
import org.springframework.util.ResourceUtils;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -29,6 +26,11 @@ import java.util.Collection;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.springframework.beans.BeanUtils;
|
||||||
|
import org.springframework.util.CollectionUtils;
|
||||||
|
import org.springframework.util.ResourceUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HanLP helper
|
* HanLP helper
|
||||||
@@ -200,16 +202,69 @@ public class HanlpHelper {
|
|||||||
if (CollectionUtils.isEmpty(mapResults)) {
|
if (CollectionUtils.isEmpty(mapResults)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
List<T> newResults = new ArrayList<>();
|
||||||
for (T mapResult : mapResults) {
|
for (T mapResult : mapResults) {
|
||||||
|
boolean isAdd = false;
|
||||||
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
|
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
|
||||||
if (CustomDictionary.contains(mapResult.getName())) {
|
if (CustomDictionary.contains(mapResult.getName())) {
|
||||||
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
|
CoreDictionary.Attribute attribute = CustomDictionary.get(mapResult.getName());
|
||||||
if (attribute != null && attribute.original != null) {
|
if (attribute != null) {
|
||||||
mapResult.setName(attribute.original);
|
isAdd = addLetterOriginal(newResults, mapResult, attribute);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!isAdd) {
|
||||||
|
newResults.add(mapResult);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mapResults.clear();
|
||||||
|
mapResults.addAll(newResults);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends MapResult> boolean addLetterOriginal(List<T> mapResults, T mapResult,
|
||||||
|
CoreDictionary.Attribute attribute) {
|
||||||
|
boolean isAdd = false;
|
||||||
|
if (attribute != null) {
|
||||||
|
if (mapResult instanceof HanlpMapResult) {
|
||||||
|
HanlpMapResult hanlpMapResult = (HanlpMapResult) mapResult;
|
||||||
|
for (String nature : hanlpMapResult.getNatures()) {
|
||||||
|
String orig = attribute.getOriginal(Nature.fromString(nature));
|
||||||
|
if (orig != null) {
|
||||||
|
MapResult addMapResult = new HanlpMapResult(orig, Arrays.asList(nature),
|
||||||
|
hanlpMapResult.getDetectWord());
|
||||||
|
mapResults.add((T) addMapResult);
|
||||||
|
isAdd = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (mapResult instanceof DatabaseMapResult) {
|
||||||
|
List<String> originals = attribute.getOriginals();
|
||||||
|
if (!CollectionUtils.isEmpty(originals)) {
|
||||||
|
for (String orig : originals) {
|
||||||
|
DatabaseMapResult addMapResult = new DatabaseMapResult();
|
||||||
|
addMapResult.setName(orig);
|
||||||
|
addMapResult.setSchemaElement(((DatabaseMapResult) mapResult).getSchemaElement());
|
||||||
|
addMapResult.setDetectWord(mapResult.getDetectWord());
|
||||||
|
mapResults.add((T) addMapResult);
|
||||||
|
isAdd = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (mapResult instanceof EmbeddingResult) {
|
||||||
|
List<String> originals = attribute.getOriginals();
|
||||||
|
if (!CollectionUtils.isEmpty(originals)) {
|
||||||
|
for (String orig : originals) {
|
||||||
|
EmbeddingResult addMapResult = new EmbeddingResult();
|
||||||
|
addMapResult.setName(orig);
|
||||||
|
addMapResult.setDetectWord(mapResult.getDetectWord());
|
||||||
|
addMapResult.setId(((EmbeddingResult) mapResult).getId());
|
||||||
|
addMapResult.setMetadata(((EmbeddingResult) mapResult).getMetadata());
|
||||||
|
addMapResult.setDistance(((EmbeddingResult) mapResult).getDistance());
|
||||||
|
mapResults.add((T) addMapResult);
|
||||||
|
isAdd = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return isAdd;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
|
public static List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
|
||||||
|
|||||||
@@ -0,0 +1,39 @@
|
|||||||
|
package com.tencent.supersonic.util;
|
||||||
|
|
||||||
|
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||||
|
import com.hankcs.hanlp.dictionary.CoreDictionary.Attribute;
|
||||||
|
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.MapResult;
|
||||||
|
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class HanlpTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
Nature[] nature = new Nature[2];
|
||||||
|
nature[0] = Nature.create("_3");
|
||||||
|
nature[1] = Nature.create("_4");
|
||||||
|
int[] frequency = new int[2];
|
||||||
|
frequency[0] = 100;
|
||||||
|
frequency[1] = 200;
|
||||||
|
String[] originals = new String[2];
|
||||||
|
originals[0] = "AA";
|
||||||
|
originals[1] = "Aa";
|
||||||
|
Attribute att = new Attribute(nature, frequency, originals, 200);
|
||||||
|
att.original = "DDDDD";
|
||||||
|
HanlpHelper.getDynamicCustomDictionary().getTrie().set("aa", att);
|
||||||
|
List<MapResult> mapResults = new ArrayList<>();
|
||||||
|
DatabaseMapResult addMapResult = new DatabaseMapResult();
|
||||||
|
addMapResult.setName("aa");
|
||||||
|
addMapResult.setSchemaElement(new SchemaElement());
|
||||||
|
addMapResult.setDetectWord("abc");
|
||||||
|
mapResults.add(addMapResult);
|
||||||
|
HanlpHelper.transLetterOriginal(mapResults);
|
||||||
|
Assert.assertEquals(mapResults.size(), 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user