From 75e15f4c5095fe2ddebe40bf088ea1840cc16187 Mon Sep 17 00:00:00 2001 From: lexluo Date: Sat, 8 Jul 2023 15:50:39 +0800 Subject: [PATCH] [improvement](chat) unformatted hanlp code make seach/query work --- .../collection/trie/bintrie/BaseNode.java | 65 +++++++++--------- .../hanlp/dictionary/CoreDictionary.java | 66 ++++++++++--------- .../hankcs/hanlp/seg/WordBasedSegment.java | 8 +-- 3 files changed, 71 insertions(+), 68 deletions(-) diff --git a/chat/knowledge/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java b/chat/knowledge/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java index 895634c1e..ec978fe09 100644 --- a/chat/knowledge/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java +++ b/chat/knowledge/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java @@ -19,7 +19,6 @@ public abstract class BaseNode implements Comparable { * 状态数组,方便读取的时候用 */ static final Status[] ARRAY_STATUS = Status.values(); - public String prefix = null; /** * 子节点 */ @@ -37,6 +36,8 @@ public abstract class BaseNode implements Comparable { */ protected V value; + public String prefix = null; + public BaseNode transition(String path, int begin) { BaseNode cur = this; for (int i = begin; i < path.length(); ++i) { @@ -230,6 +231,37 @@ public abstract class BaseNode implements Comparable { } } + public enum Status { + /** + * 未指定,用于删除词条 + */ + UNDEFINED_0, + /** + * 不是词语的结尾 + */ + NOT_WORD_1, + /** + * 是个词语的结尾,并且还可以继续 + */ + WORD_MIDDLE_2, + /** + * 是个词语的结尾,并且没有继续 + */ + WORD_END_3, + } + + public class TrieEntry extends AbstractMap.SimpleEntry implements Comparable { + + public TrieEntry(String key, V value) { + super(key, value); + } + + @Override + public int compareTo(TrieEntry o) { + return getKey().compareTo(String.valueOf(o.getKey())); + } + } + @Override public String toString() { return "BaseNode{" @@ -284,35 +316,4 @@ public abstract class BaseNode implements Comparable { } } - public enum Status { - /** - * 未指定,用于删除词条 - */ - UNDEFINED_0, - /** - * 不是词语的结尾 - */ - NOT_WORD_1, - /** - * 是个词语的结尾,并且还可以继续 - */ - WORD_MIDDLE_2, - /** - * 是个词语的结尾,并且没有继续 - */ - WORD_END_3, - } - - public class TrieEntry extends AbstractMap.SimpleEntry implements Comparable { - - public TrieEntry(String key, V value) { - super(key, value); - } - - @Override - public int compareTo(TrieEntry o) { - return getKey().compareTo(String.valueOf(o.getKey())); - } - } - } diff --git a/chat/knowledge/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java b/chat/knowledge/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java index fecc6cb1e..e97c9f903 100644 --- a/chat/knowledge/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java +++ b/chat/knowledge/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java @@ -26,16 +26,9 @@ import java.util.TreeMap; */ public class CoreDictionary { - public static final String PATH = HanLP.Config.CoreDictionaryPath; public static DoubleArrayTrie trie = new DoubleArrayTrie(); - // 一些特殊的WORD_ID - public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE); - public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE); - public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP); - public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME); - public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER); - public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER); - public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER); + + public static final String PATH = HanLP.Config.CoreDictionaryPath; // 自动加载词典 static { @@ -47,6 +40,15 @@ public class CoreDictionary { } } + // 一些特殊的WORD_ID + public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE); + public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE); + public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP); + public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME); + public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER); + public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER); + public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER); + private static boolean load(String path) { logger.info("核心词典开始加载:" + path); if (loadDat(path)) { @@ -198,29 +200,6 @@ public class CoreDictionary { return trie.get(key) != null; } - /** - * 获取词语的ID - * - * @param a 词语 - * @return ID, 如果不存在, 则返回-1 - */ - public static int getWordID(String a) { - return CoreDictionary.trie.exactMatchSearch(a); - } - - /** - * 热更新核心词典
- * 集群环境(或其他IOAdapter)需要自行删除缓存文件 - * - * @return 是否成功 - */ - public static boolean reload() { - String path = CoreDictionary.PATH; - IOUtil.deleteFile(path + Predefine.BIN_EXT); - - return load(path); - } - /** * 核心词典中的词属性 */ @@ -387,5 +366,28 @@ public class CoreDictionary { } } } + + /** + * 获取词语的ID + * + * @param a 词语 + * @return ID, 如果不存在, 则返回-1 + */ + public static int getWordID(String a) { + return CoreDictionary.trie.exactMatchSearch(a); + } + + /** + * 热更新核心词典
+ * 集群环境(或其他IOAdapter)需要自行删除缓存文件 + * + * @return 是否成功 + */ + public static boolean reload() { + String path = CoreDictionary.PATH; + IOUtil.deleteFile(path + Predefine.BIN_EXT); + + return load(path); + } } diff --git a/chat/knowledge/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java b/chat/knowledge/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java index b467abba3..47204ec23 100644 --- a/chat/knowledge/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java +++ b/chat/knowledge/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java @@ -236,10 +236,6 @@ public abstract class WordBasedSegment extends Segment { } } - protected static void speechTagging(List vertexList) { - Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary); - } - protected void generateWordNet(final WordNet wordNetStorage) { final char[] charArray = wordNetStorage.charArray; DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0); @@ -326,6 +322,10 @@ public abstract class WordBasedSegment extends Segment { return termList; } + protected static void speechTagging(List vertexList) { + Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary); + } + protected void addTerms(List terms, Vertex vertex, int offset) { for (int i = 0; i < vertex.attribute.nature.length; i++) { Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);