[improvement][project] Global refactor: code formatting, LLM support, fuzzy detection, query filtering, and more.

lexluo
2023-07-08 15:00:03 +08:00
parent 5ffd617431
commit 404163f391
329 changed files with 21050 additions and 5036 deletions

@@ -19,6 +19,7 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
* Status array, kept for convenient reading
*/
static final Status[] ARRAY_STATUS = Status.values();
public String prefix = null;
/**
* Child nodes
*/
@@ -36,8 +37,6 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
*/
protected V value;
public String prefix = null;
public BaseNode<V> transition(String path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length(); ++i) {
@@ -231,37 +230,6 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
}
}
public enum Status {
/**
 * Unspecified; used for deleting entries
 */
UNDEFINED_0,
/**
 * Not the end of a word
 */
NOT_WORD_1,
/**
 * End of a word, and the path can continue
 */
WORD_MIDDLE_2,
/**
 * End of a word, with nothing following
 */
WORD_END_3,
}
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
public TrieEntry(String key, V value) {
super(key, value);
}
@Override
public int compareTo(TrieEntry o) {
return getKey().compareTo(String.valueOf(o.getKey()));
}
}
@Override
public String toString() {
return "BaseNode{"
@@ -316,4 +284,35 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
}
}
public enum Status {
/**
 * Unspecified; used for deleting entries
 */
UNDEFINED_0,
/**
 * Not the end of a word
 */
NOT_WORD_1,
/**
 * End of a word, and the path can continue
 */
WORD_MIDDLE_2,
/**
 * End of a word, with nothing following
 */
WORD_END_3,
}
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
public TrieEntry(String key, V value) {
super(key, value);
}
@Override
public int compareTo(TrieEntry o) {
return getKey().compareTo(String.valueOf(o.getKey()));
}
}
}

@@ -26,9 +26,16 @@ import java.util.TreeMap;
*/
public class CoreDictionary {
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
public static final String PATH = HanLP.Config.CoreDictionaryPath;
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
// Some special WORD_IDs
public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);
// Auto-load the dictionary
static {
@@ -40,15 +47,6 @@ public class CoreDictionary {
}
}
// Some special WORD_IDs
public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);
private static boolean load(String path) {
logger.info("核心词典开始加载:" + path);
if (loadDat(path)) {
@@ -200,6 +198,29 @@ public class CoreDictionary {
return trie.get(key) != null;
}
/**
* Get the ID of a word
*
* @param a the word
* @return the ID, or -1 if the word does not exist
*/
public static int getWordID(String a) {
return CoreDictionary.trie.exactMatchSearch(a);
}
/**
* Hot-reload the core dictionary<br>
* In a cluster environment or with other IOAdapters, the cache file must be deleted manually
*
* @return whether the reload succeeded
*/
public static boolean reload() {
String path = CoreDictionary.PATH;
IOUtil.deleteFile(path + Predefine.BIN_EXT);
return load(path);
}
/**
* Word attributes in the core dictionary
*/
@@ -366,28 +387,5 @@ public class CoreDictionary {
}
}
}
/**
* Get the ID of a word
*
* @param a the word
* @return the ID, or -1 if the word does not exist
*/
public static int getWordID(String a) {
return CoreDictionary.trie.exactMatchSearch(a);
}
/**
* Hot-reload the core dictionary<br>
* In a cluster environment or with other IOAdapters, the cache file must be deleted manually
*
* @return whether the reload succeeded
*/
public static boolean reload() {
String path = CoreDictionary.PATH;
IOUtil.deleteFile(path + Predefine.BIN_EXT);
return load(path);
}
}
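
The relocated getWordID and reload helpers above are the public entry points this diff groups with the rest of CoreDictionary. A minimal usage sketch, assuming CoreDictionary and the HanLP core dictionary data are available on the classpath and configured path; the lookup word below is illustrative only, not taken from this commit:

// Hypothetical usage of the CoreDictionary helpers shown in this diff.
public class CoreDictionaryUsageSketch {
    public static void main(String[] args) {
        // Exact-match lookup in the double-array trie: returns the word ID, or -1 when absent.
        int id = CoreDictionary.getWordID("查询");
        System.out.println("word id = " + id);
        // Hot reload: deletes the cached BIN_EXT file under CoreDictionary.PATH and reloads.
        boolean ok = CoreDictionary.reload();
        System.out.println("reload ok = " + ok);
    }
}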

@@ -236,6 +236,10 @@ public abstract class WordBasedSegment extends Segment {
}
}
protected static void speechTagging(List<Vertex> vertexList) {
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
}
protected void generateWordNet(final WordNet wordNetStorage) {
final char[] charArray = wordNetStorage.charArray;
DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
@@ -322,10 +326,6 @@ public abstract class WordBasedSegment extends Segment {
return termList;
}
protected static void speechTagging(List<Vertex> vertexList) {
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
}
protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
for (int i = 0; i < vertex.attribute.nature.length; i++) {
Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);

@@ -49,11 +49,4 @@ public abstract class BaseWordNature {
return 0;
}
public Long getFrequency(String nature) {
String[] split = nature.split(NatureType.NATURE_SPILT);
if (split.length >= 3) {
return Long.valueOf(split[2]);
}
return 0L;
}
}
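
The getFrequency method removed above treats the stored nature string as a delimited record and reads the third field as the frequency, falling back to 0. A standalone sketch of that parsing, assuming a "#"-style delimiter; the real delimiter is NatureType.NATURE_SPILT, which is not shown in this diff:

// Standalone restatement of the frequency parsing above; the "#" delimiter and the
// sample record are assumptions for illustration.
public class FrequencyParseSketch {
    static Long getFrequency(String nature, String delimiter) {
        String[] split = nature.split(delimiter);
        return split.length >= 3 ? Long.valueOf(split[2]) : 0L;
    }

    public static void main(String[] args) {
        System.out.println(getFrequency("dimension#n#100", "#")); // third field -> 100
        System.out.println(getFrequency("n", "#"));               // fewer than 3 fields -> 0
    }
}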

@@ -27,8 +27,4 @@ public class DomainWordNature extends BaseWordNature {
return result;
}
@Override
public Long getFrequency(String nature) {
return 0L;
}
}

@@ -14,6 +14,7 @@ import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
/**
@@ -21,13 +22,13 @@ import org.springframework.util.ResourceUtils;
*/
public class HanlpHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
public static final String FILE_SPILT = "/";
public static final String SPACE_SPILT = "#";
private static volatile Segment segment;
public static volatile DynamicCustomDictionary CustomDictionary;
public static final String DICT_MAIN_FILE_NAME = "CustomDictionary.txt";
public static final String DICT_CLASS = "classes";
private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
public static volatile DynamicCustomDictionary CustomDictionary;
private static volatile Segment segment;
static {
// reset hanlp config
@@ -152,11 +153,14 @@ public class HanlpHelper {
}
public static boolean addToCustomDictionary(WordNature wordNature) {
LOGGER.debug("wordNature:{}", wordNature);
LOGGER.info("wordNature:{}", wordNature);
return getDynamicCustomDictionary().insert(wordNature.getWord(), wordNature.getNatureWithFrequency());
}
public static void transLetterOriginal(List<MapResult> mapResults) {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}
for (MapResult mapResult : mapResults) {
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())) {
if (CustomDictionary.contains(mapResult.getName())) {

@@ -53,6 +53,7 @@ public class HdfsFileHelper {
/**
* reset path
*
* @param customDictionary
* @throws IOException
*/

@@ -42,20 +42,6 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
super(path);
}
public boolean load(String... path) {
this.path = path;
long start = System.currentTimeMillis();
if (!this.loadMainDictionary(path[0])) {
Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
return false;
} else {
Predefine.logger.info(
"自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
this.path = path;
return true;
}
}
/***
* load dictionary
* @param path
@@ -139,10 +125,6 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
}
}
public boolean loadMainDictionary(String mainPath) {
return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
}
/***
* load main dictionary
* @param mainPath
@@ -291,6 +273,53 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
}
}
public static boolean isLetters(String str) {
char[] chars = str.toCharArray();
if (chars.length <= 1) {
return false;
}
for (int i = 0; i < chars.length; i++) {
if ((chars[i] >= 'A' && chars[i] <= 'Z')) {
return true;
}
}
return false;
}
public static boolean isLowerLetter(String str) {
char[] chars = str.toCharArray();
for (int i = 0; i < chars.length; i++) {
if ((chars[i] >= 'a' && chars[i] <= 'z')) {
return true;
}
}
return false;
}
public static String getWordBySpace(String word) {
if (word.contains(HanlpHelper.SPACE_SPILT)) {
return word.replace(HanlpHelper.SPACE_SPILT, " ");
}
return word;
}
public boolean load(String... path) {
this.path = path;
long start = System.currentTimeMillis();
if (!this.loadMainDictionary(path[0])) {
Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
return false;
} else {
Predefine.logger.info(
"自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
this.path = path;
return true;
}
}
public boolean loadMainDictionary(String mainPath) {
return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
}
public boolean reload() {
if (this.path != null && this.path.length != 0) {
@@ -344,35 +373,4 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
return true;
}
}
public static boolean isLetters(String str) {
char[] chars = str.toCharArray();
if (chars.length <= 1) {
return false;
}
for (int i = 0; i < chars.length; i++) {
if ((chars[i] >= 'A' && chars[i] <= 'Z')) {
return true;
}
}
return false;
}
public static boolean isLowerLetter(String str) {
char[] chars = str.toCharArray();
for (int i = 0; i < chars.length; i++) {
if ((chars[i] >= 'a' && chars[i] <= 'z')) {
return true;
}
}
return false;
}
public static String getWordBySpace(String word) {
if (word.contains(HanlpHelper.SPACE_SPILT)) {
return word.replace(HanlpHelper.SPACE_SPILT, " ");
}
return word;
}
}
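
The three static helpers consolidated above (isLetters, isLowerLetter, getWordBySpace) are plain string checks, so they can be exercised in isolation. A minimal sketch, assuming HanlpHelper.SPACE_SPILT keeps the "#" value declared earlier in this commit; the inputs are illustrative only:

// Hypothetical usage of the string helpers shown above.
public class CustomDictionaryHelperSketch {
    public static void main(String[] args) {
        // true: more than one character and at least one upper-case ASCII letter
        System.out.println(MultiCustomDictionary.isLetters("LLM"));
        // true: contains at least one lower-case ASCII letter
        System.out.println(MultiCustomDictionary.isLowerLetter("fuzzy"));
        // "#" placeholders are turned back into spaces: prints "query filter"
        System.out.println(MultiCustomDictionary.getWordBySpace("query#filter"));
    }
}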

@@ -23,13 +23,12 @@ import org.springframework.util.CollectionUtils;
@Service
public class Suggester {
public static final int SEARCH_SIZE = 200;
private static final Logger LOGGER = LoggerFactory.getLogger(Suggester.class);
private static BinTrie<List<String>> trie;
private static BinTrie<List<String>> suffixTrie;
private static String localFileCache = "";
public static final int SEARCH_SIZE = 200;
static {
trie = new BinTrie<>();
suffixTrie = new BinTrie<>();
@@ -53,7 +52,7 @@ public class Suggester {
return result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
return new MapResult(name, entry.getValue(),key);
return new MapResult(name, entry.getValue(), key);
}
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
.limit(SEARCH_SIZE)