Mirror of https://github.com/tencentmusic/supersonic.git, synced 2025-12-20 06:34:55 +00:00.
[improvement][project] Global refactor: code formatting, LLM support, fuzzy detection, query filters, and more.
@@ -19,6 +19,7 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
     * Status array, kept around for convenient reads
     */
    static final Status[] ARRAY_STATUS = Status.values();
    public String prefix = null;
    /**
     * Child nodes
     */
@@ -36,8 +37,6 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
     */
    protected V value;

    public String prefix = null;

    public BaseNode<V> transition(String path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length(); ++i) {
@@ -231,37 +230,6 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
        }
    }

    public enum Status {
        /**
         * Unspecified; used when deleting an entry
         */
        UNDEFINED_0,
        /**
         * Not the end of a word
         */
        NOT_WORD_1,
        /**
         * The end of a word, and the path can still continue
         */
        WORD_MIDDLE_2,
        /**
         * The end of a word, with no continuation
         */
        WORD_END_3,
    }

    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {

        public TrieEntry(String key, V value) {
            super(key, value);
        }

        @Override
        public int compareTo(TrieEntry o) {
            return getKey().compareTo(String.valueOf(o.getKey()));
        }
    }

    @Override
    public String toString() {
        return "BaseNode{"
@@ -316,4 +284,35 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
        }
    }

    public enum Status {
        /**
         * Unspecified; used when deleting an entry
         */
        UNDEFINED_0,
        /**
         * Not the end of a word
         */
        NOT_WORD_1,
        /**
         * The end of a word, and the path can still continue
         */
        WORD_MIDDLE_2,
        /**
         * The end of a word, with no continuation
         */
        WORD_END_3,
    }

    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {

        public TrieEntry(String key, V value) {
            super(key, value);
        }

        @Override
        public int compareTo(TrieEntry o) {
            return getKey().compareTo(String.valueOf(o.getKey()));
        }
    }

}
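The two BaseNode hunks above mostly reorder members: the Status enum and the TrieEntry inner class move to the end of the class, and their behavior is unchanged. For readers unfamiliar with HanLP-style trie nodes, the following is a minimal, self-contained toy sketch (not HanLP code) of what the four Status values mean, in particular the difference between "end of a word that can continue" (WORD_MIDDLE_2) and "end of a word with no continuation" (WORD_END_3):

// Toy trie illustrating BaseNode.Status semantics; illustrative only, not the project's implementation.
import java.util.HashMap;
import java.util.Map;

class StatusDemo {
    enum Status { UNDEFINED_0, NOT_WORD_1, WORD_MIDDLE_2, WORD_END_3 }

    static class Node {
        Map<Character, Node> children = new HashMap<>();
        Status status = Status.NOT_WORD_1; // a plain path node is not a word ending
    }

    static void insert(Node root, String word) {
        Node cur = root;
        for (char c : word.toCharArray()) {
            if (cur.status == Status.WORD_END_3) {
                // This node is about to gain a child, so its word can now continue.
                cur.status = Status.WORD_MIDDLE_2;
            }
            cur = cur.children.computeIfAbsent(c, k -> new Node());
        }
        cur.status = cur.children.isEmpty() ? Status.WORD_END_3 : Status.WORD_MIDDLE_2;
    }

    static Status statusOf(Node root, String word) {
        Node cur = root;
        for (char c : word.toCharArray()) {
            cur = cur.children.get(c);
            if (cur == null) {
                return Status.NOT_WORD_1; // path absent: treat as "not a word" in this toy
            }
        }
        return cur.status;
    }

    public static void main(String[] args) {
        Node root = new Node();
        insert(root, "中国");
        insert(root, "中国人");
        System.out.println(statusOf(root, "中国"));   // WORD_MIDDLE_2: a word ending whose path continues
        System.out.println(statusOf(root, "中国人")); // WORD_END_3: a word ending at a leaf
    }
}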
@@ -26,9 +26,16 @@ import java.util.TreeMap;
 */
public class CoreDictionary {

    public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();

    public static final String PATH = HanLP.Config.CoreDictionaryPath;
    public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
    // Some special WORD_IDs
    public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
    public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
    public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
    public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
    public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
    public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
    public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

    // Load the dictionary automatically
    static {
@@ -40,15 +47,6 @@ public class CoreDictionary {
        }
    }

    // Some special WORD_IDs
    public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
    public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
    public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
    public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
    public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
    public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
    public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

    private static boolean load(String path) {
        logger.info("核心词典开始加载:" + path);
        if (loadDat(path)) {
@@ -200,6 +198,29 @@ public class CoreDictionary {
        return trie.get(key) != null;
    }

    /**
     * Get the ID of a word
     *
     * @param a the word
     * @return the word's ID, or -1 if it does not exist
     */
    public static int getWordID(String a) {
        return CoreDictionary.trie.exactMatchSearch(a);
    }

    /**
     * Hot-reload the core dictionary<br>
     * In a cluster environment (or with any other IOAdapter), the cache file must be deleted manually
     *
     * @return whether the reload succeeded
     */
    public static boolean reload() {
        String path = CoreDictionary.PATH;
        IOUtil.deleteFile(path + Predefine.BIN_EXT);

        return load(path);
    }

    /**
     * Word attributes in the core dictionary
     */
@@ -366,28 +387,5 @@ public class CoreDictionary {
            }
        }
    }

    /**
     * Get the ID of a word
     *
     * @param a the word
     * @return the word's ID, or -1 if it does not exist
     */
    public static int getWordID(String a) {
        return CoreDictionary.trie.exactMatchSearch(a);
    }

    /**
     * Hot-reload the core dictionary<br>
     * In a cluster environment (or with any other IOAdapter), the cache file must be deleted manually
     *
     * @return whether the reload succeeded
     */
    public static boolean reload() {
        String path = CoreDictionary.PATH;
        IOUtil.deleteFile(path + Predefine.BIN_EXT);

        return load(path);
    }
}
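The CoreDictionary hunks only move getWordID and reload earlier in the class; their behavior is as documented in the diff (getWordID is backed by exactMatchSearch and returns -1 for missing words, reload deletes the .bin cache and re-reads the dictionary). A hedged usage sketch, assuming the dictionary path is configured via HanLP.Config.CoreDictionaryPath and the import matches wherever this project keeps its CoreDictionary copy:

// Minimal sketch; package/import omitted because the exact module layout is not shown in this diff.
public class CoreDictionaryDemo {
    public static void main(String[] args) {
        // Lookup: returns the word ID, or -1 if the word is absent from the core dictionary.
        int id = CoreDictionary.getWordID("北京");
        System.out.println(id >= 0 ? "word id = " + id : "not in the core dictionary");

        // Hot reload: removes the cached binary file next to the dictionary, then reloads from source.
        boolean ok = CoreDictionary.reload();
        System.out.println("reload ok = " + ok);
    }
}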
@@ -236,6 +236,10 @@ public abstract class WordBasedSegment extends Segment {
        }
    }

    protected static void speechTagging(List<Vertex> vertexList) {
        Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
    }

    protected void generateWordNet(final WordNet wordNetStorage) {
        final char[] charArray = wordNetStorage.charArray;
        DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
@@ -322,10 +326,6 @@ public abstract class WordBasedSegment extends Segment {
        return termList;
    }

    protected static void speechTagging(List<Vertex> vertexList) {
        Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
    }

    protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
        for (int i = 0; i < vertex.attribute.nature.length; i++) {
            Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);
@@ -49,11 +49,4 @@ public abstract class BaseWordNature {
        return 0;
    }

    public Long getFrequency(String nature) {
        String[] split = nature.split(NatureType.NATURE_SPILT);
        if (split.length >= 3) {
            return Long.valueOf(split[2]);
        }
        return 0L;
    }
}
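For context, the getFrequency shown in this hunk splits a nature-with-frequency string and reads the third field as the frequency. The actual value of NatureType.NATURE_SPILT is not shown in this diff; the sketch below assumes an underscore separator and made-up sample values purely for illustration:

// Illustrative only: NATURE_SPILT's real value lives in NatureType and is not part of this diff.
public class NatureFrequencyDemo {
    private static final String NATURE_SPILT = "_"; // assumption for this example

    static Long getFrequency(String nature) {
        String[] split = nature.split(NATURE_SPILT);
        if (split.length >= 3) {
            return Long.valueOf(split[2]); // third field is read as the frequency
        }
        return 0L; // no frequency field present
    }

    public static void main(String[] args) {
        System.out.println(getFrequency("tag_metric_100")); // 100
        System.out.println(getFrequency("tag_metric"));     // 0
    }
}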
@@ -27,8 +27,4 @@ public class DomainWordNature extends BaseWordNature {
        return result;
    }

    @Override
    public Long getFrequency(String nature) {
        return 0L;
    }
}
@@ -14,6 +14,7 @@ import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;

/**
@@ -21,13 +22,13 @@ import org.springframework.util.ResourceUtils;
 */
public class HanlpHelper {

    private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
    public static final String FILE_SPILT = "/";
    public static final String SPACE_SPILT = "#";
    private static volatile Segment segment;
    public static volatile DynamicCustomDictionary CustomDictionary;
    public static final String DICT_MAIN_FILE_NAME = "CustomDictionary.txt";
    public static final String DICT_CLASS = "classes";
    private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
    public static volatile DynamicCustomDictionary CustomDictionary;
    private static volatile Segment segment;

    static {
        // reset hanlp config
@@ -152,11 +153,14 @@ public class HanlpHelper {
    }

    public static boolean addToCustomDictionary(WordNature wordNature) {
        LOGGER.debug("wordNature:{}", wordNature);
        LOGGER.info("wordNature:{}", wordNature);
        return getDynamicCustomDictionary().insert(wordNature.getWord(), wordNature.getNatureWithFrequency());
    }

    public static void transLetterOriginal(List<MapResult> mapResults) {
        if (CollectionUtils.isEmpty(mapResults)) {
            return;
        }
        for (MapResult mapResult : mapResults) {
            if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
                if (CustomDictionary.contains(mapResult.getName())) {
@@ -53,6 +53,7 @@ public class HdfsFileHelper {

    /**
     * reset path
     *
     * @param customDictionary
     * @throws IOException
     */
@@ -42,20 +42,6 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
        super(path);
    }

    public boolean load(String... path) {
        this.path = path;
        long start = System.currentTimeMillis();
        if (!this.loadMainDictionary(path[0])) {
            Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
            return false;
        } else {
            Predefine.logger.info(
                "自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
            this.path = path;
            return true;
        }
    }

    /***
     * load dictionary
     * @param path
@@ -139,10 +125,6 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
        }
    }

    public boolean loadMainDictionary(String mainPath) {
        return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
    }

    /***
     * load main dictionary
     * @param mainPath
@@ -291,6 +273,53 @@
        }
    }

    public static boolean isLetters(String str) {
        char[] chars = str.toCharArray();
        if (chars.length <= 1) {
            return false;
        }
        for (int i = 0; i < chars.length; i++) {
            if ((chars[i] >= 'A' && chars[i] <= 'Z')) {
                return true;
            }
        }
        return false;
    }

    public static boolean isLowerLetter(String str) {
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            if ((chars[i] >= 'a' && chars[i] <= 'z')) {
                return true;
            }
        }
        return false;
    }

    public static String getWordBySpace(String word) {
        if (word.contains(HanlpHelper.SPACE_SPILT)) {
            return word.replace(HanlpHelper.SPACE_SPILT, " ");
        }
        return word;
    }

    public boolean load(String... path) {
        this.path = path;
        long start = System.currentTimeMillis();
        if (!this.loadMainDictionary(path[0])) {
            Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
            return false;
        } else {
            Predefine.logger.info(
                "自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
            this.path = path;
            return true;
        }
    }

    public boolean loadMainDictionary(String mainPath) {
        return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
    }

    public boolean reload() {
        if (this.path != null && this.path.length != 0) {
@@ -344,35 +373,4 @@
            return true;
        }
    }

    public static boolean isLetters(String str) {
        char[] chars = str.toCharArray();
        if (chars.length <= 1) {
            return false;
        }
        for (int i = 0; i < chars.length; i++) {
            if ((chars[i] >= 'A' && chars[i] <= 'Z')) {
                return true;
            }
        }
        return false;
    }

    public static boolean isLowerLetter(String str) {
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            if ((chars[i] >= 'a' && chars[i] <= 'z')) {
                return true;
            }
        }
        return false;
    }

    public static String getWordBySpace(String word) {
        if (word.contains(HanlpHelper.SPACE_SPILT)) {
            return word.replace(HanlpHelper.SPACE_SPILT, " ");
        }
        return word;
    }
}
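The helper trio isLetters / isLowerLetter / getWordBySpace is only moved within MultiCustomDictionary. Since getWordBySpace undoes the "#" placeholder that HanlpHelper.SPACE_SPILT substitutes for spaces in stored entries, here is a short usage sketch based directly on the code above (imports depend on the project's package layout; the sample words are made up):

// Sketch of the helper semantics shown in the diff.
public class DictionaryHelperDemo {
    public static void main(String[] args) {
        // Multi-word entries are stored with '#' (HanlpHelper.SPACE_SPILT) in place of spaces;
        // getWordBySpace maps them back for display and matching.
        System.out.println(MultiCustomDictionary.getWordBySpace("hip#hop"));    // "hip hop"
        System.out.println(MultiCustomDictionary.getWordBySpace("supersonic")); // unchanged

        // isLetters: true only for strings longer than one character containing an upper-case A-Z.
        System.out.println(MultiCustomDictionary.isLetters("GDP")); // true
        System.out.println(MultiCustomDictionary.isLetters("gdp")); // false

        // isLowerLetter: true if any character is a lower-case a-z.
        System.out.println(MultiCustomDictionary.isLowerLetter("gdp增速")); // true
    }
}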
@@ -23,13 +23,12 @@ import org.springframework.util.CollectionUtils;
@Service
public class Suggester {

    public static final int SEARCH_SIZE = 200;
    private static final Logger LOGGER = LoggerFactory.getLogger(Suggester.class);
    private static BinTrie<List<String>> trie;
    private static BinTrie<List<String>> suffixTrie;
    private static String localFileCache = "";

    public static final int SEARCH_SIZE = 200;

    static {
        trie = new BinTrie<>();
        suffixTrie = new BinTrie<>();
@@ -53,7 +52,7 @@ public class Suggester {
        return result.stream().map(
                entry -> {
                    String name = entry.getKey().replace("#", " ");
                    return new MapResult(name, entry.getValue(),key);
                    return new MapResult(name, entry.getValue(), key);
                }
        ).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
                .limit(SEARCH_SIZE)
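The only functional change in the Suggester hunk is the spacing in the MapResult constructor call; the surrounding pipeline (map trie entries, restore spaces from "#", sort by name length, cap at SEARCH_SIZE) is unchanged. A self-contained sketch of that pipeline over plain map entries, with MapResult replaced by a display name since its constructor is project-specific:

// Stand-alone sketch of the Suggester mapping step; "#" is the stored placeholder for spaces.
import java.util.AbstractMap;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SuggesterPipelineDemo {
    static final int SEARCH_SIZE = 200;

    public static void main(String[] args) {
        List<Map.Entry<String, List<String>>> result = Arrays.asList(
                new AbstractMap.SimpleEntry<>("hip#hop", Arrays.asList("_genre_")),
                new AbstractMap.SimpleEntry<>("hiphop金曲", Arrays.asList("_playlist_")));

        List<String> names = result.stream()
                .map(entry -> entry.getKey().replace("#", " "))   // restore spaces for display
                .sorted((a, b) -> -(b.length() - a.length()))     // same comparator shape as the diff: ascending name length
                .limit(SEARCH_SIZE)                               // cap results, as Suggester does
                .collect(Collectors.toList());

        names.forEach(System.out::println); // "hip hop", then "hiphop金曲"
    }
}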